In [73]:
from data import FaceDataset 
from torch.utils.data import DataLoader, Dataset, Subset
from torchvision import transforms
import torch
import pandas as pd
from PIL import Image
import os
import numpy as np

In [3]:
df_path = 'norm_bounding_boxes.csv'
# Initialize dataset and dataloader

In [76]:

class FaceDataset(Dataset):
    def __init__(self, df_path, image_dir):
        self.dataframe = pd.read_csv(df_path)
        self.image_dir = image_dir

        # Transformations
        self.transform = transforms.Compose([
            transforms.Resize((92, 84)),
            transforms.ToTensor()
        ])
    
    def __len__(self):
        return len(self.dataframe)
    
    def __getitem__(self, idx):
        img_name = os.path.join(self.image_dir, self.dataframe.iloc[idx, 0])
        image = Image.open(img_name)
        bbox = self.dataframe.iloc[idx, 1:5].values
        face_crop = image.crop((bbox[0], bbox[1], bbox[2], bbox[3]))

        
        if self.transform:
            face_crop = self.transform(face_crop)

        to_tensor = transforms.ToTensor()
        image = to_tensor(image)
        
        bbox = np.array(bbox, dtype=int)
        
        return image, face_crop, torch.tensor(bbox)
    
dataset = FaceDataset(df_path, '../celeba/img_align_celeba')
dataloader = DataLoader(dataset, batch_size=25, shuffle=False)

In [79]:
# for x in dataloader:
#     print(x[0].shape, x[1].shape, x[2].shape)
#     break
 # Initialize dataset and dataloader

full_dataset = FaceDataset('norm_bounding_boxes.csv', '../celeba/img_align_celeba')

# Create a subset of the full dataset (100 samples)
indices = np.random.permutation(len(dataset))[:100]
print(indices)
dataset = Subset(full_dataset, indices) 

# dataset = FaceDataset('norm_bounding_boxes.csv', '../celeba/img_align_celeba')
dataloader = DataLoader(dataset, batch_size=25, shuffle=False)

# for epoch in range(num_epochs):
#     total_fg_loss = 0.0
#     total_bg_loss = 0.0
for x in dataloader:
    print(x)
    break

[3706 8414  415 5673 1754 3911 4036 7745  578 9753 6454 5737 8349 2997
  587 9785 4349 9435  809 8517 9639 8372 5475 7011 9527 1788 3220 5148
 5644 1157 8917 1796 7851 3555 7375 4327 3705 7945 6212 3548 3051 5686
 7364 2870 8302 1328 3124 8519 2319 4754 1670  384 3757 3468 8489 7740
 5375 5586 9367 5940 8671 9359 1822 1154 4175 8141 9340  377 1981 2009
 2564 6486 4303 8169 5641 9163 8086 8478 6582 9224  834 3754  389 8938
  411 1773 6050  433 1425 1388 8996 1138 3251 1011 2215 5010 8048 5254
 8287 7217]
[tensor([[[[0.9843, 0.9882, 0.9961,  ..., 1.0000, 1.0000, 1.0000],
          [0.9843, 0.9882, 0.9961,  ..., 1.0000, 1.0000, 1.0000],
          [0.9843, 0.9882, 0.9961,  ..., 1.0000, 1.0000, 1.0000],
          ...,
          [0.8196, 0.7725, 0.7529,  ..., 0.7059, 0.7098, 0.7137],
          [0.8588, 0.7804, 0.7451,  ..., 0.7098, 0.7059, 0.7059],
          [0.8588, 0.7804, 0.7451,  ..., 0.7020, 0.7059, 0.7059]],

         [[1.0000, 1.0000, 1.0000,  ..., 1.0000, 1.0000, 1.0000],
          [

In [54]:
import torch.nn as nn

class FaceAutoencder(nn.Module):
    def __init__(self, latent_dim):
        super(FaceAutoencder, self).__init__()
        # Encoder
        self.encoder = nn.Sequential(
            nn.Conv2d(3, 16, kernel_size=3, stride=1, padding=1),  # Maintains 92x84
            nn.ReLU(),
            nn.MaxPool2d(2, 2),  # Reduces to 46x42
            nn.Conv2d(16, 32, kernel_size=3, stride=1, padding=1),  # Maintains 46x42
            nn.ReLU(),
            nn.MaxPool2d(2, 2),  # Reduces to 23x21
            nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1),  # Maintains 23x21
            nn.ReLU(),
            nn.MaxPool2d(2, 2),  # Reduces to 11x10
            nn.Flatten(),
            nn.Linear(64 * 11 * 10, latent_dim),
            nn.Sigmoid()
        )
        # Decoder
        self.decoder = nn.Sequential(
            nn.Linear(latent_dim, 64 * 11 * 10),
            nn.ReLU(),
            nn.Unflatten(1, (64, 11, 10)),
            nn.ConvTranspose2d(64, 32, kernel_size=3, stride=2, padding=1, output_padding=1),  # Outputs 23x21
            nn.ReLU(),
            nn.ConvTranspose2d(32, 16, kernel_size=3, stride=2, padding=1, output_padding=1),  # Outputs 46x42
            nn.ReLU(),
            nn.ConvTranspose2d(16, 3, kernel_size=3, stride=2, padding=0, output_padding=1),  # Outputs 92x84
            nn.Sigmoid()  # Ensures output values are between 0 and 1
        )

    def forward(self, x):
        x = self.encoder(x)
        x = self.decoder(x)
        return x

In [58]:
a = torch.randn(25, 3, 92, 84)

In [59]:
model = FaceAutoencder(5)

In [61]:
model(a).shape

torch.Size([25, 3, 90, 82])

In [63]:
def create_fg_masks(bboxes, image_height=15, image_width=10):
    """
    Create foreground masks for a batch of images given their bounding boxes, vectorized version.
    """
    batch_size = bboxes.size(0)
    # Create coordinate grids
    x_coords = torch.arange(image_width).repeat(image_height, 1).unsqueeze(0).repeat(batch_size, 1, 1)
    y_coords = torch.arange(image_height).repeat(image_width, 1).t().unsqueeze(0).repeat(batch_size, 1, 1)

    # Get bbox coordinates and expand dimensions for broadcasting
    lefts = bboxes[:, 0].unsqueeze(1).unsqueeze(2)
    tops = bboxes[:, 1].unsqueeze(1).unsqueeze(2)
    rights = bboxes[:, 2].unsqueeze(1).unsqueeze(2)
    bottoms = bboxes[:, 3].unsqueeze(1).unsqueeze(2)

    # Create masks using logical operations
    masks = (x_coords >= lefts) & (x_coords < rights) & (y_coords >= tops) & (y_coords < bottoms)
    masks = masks.float().unsqueeze(1)  # Convert from bool to float and add channel dimension

    return masks

In [71]:
bboxs = torch.tensor([
[2, 5, 7, 7]
])
a = create_fg_masks(bboxs)

In [72]:
print(bboxs[0])
a[0]

tensor([2, 5, 7, 7])


tensor([[[0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 1., 1., 1., 1., 1., 0., 0., 0.],
         [0., 0., 1., 1., 1., 1., 1., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]]])

In [5]:
import torch
import torch.optim as optim
import torch.nn as nn
import numpy as np
from torch.utils.data import DataLoader, Subset
from model import TwoResAutoEncoder
from data import FaceDataset
import pdb




def create_fg_masks(bboxes, image_height=218, image_width=178):
    """
    Create foreground masks for a batch of images given their bounding boxes, vectorized version.
    """
    batch_size = bboxes.size(0)
    # Create coordinate grids
    x_coords = torch.arange(image_width).repeat(image_height, 1).unsqueeze(0).repeat(batch_size, 1, 1)
    y_coords = torch.arange(image_height).repeat(image_width, 1).t().unsqueeze(0).repeat(batch_size, 1, 1)

    # Get bbox coordinates and expand dimensions for broadcasting
    lefts = bboxes[:, 0].unsqueeze(1).unsqueeze(2)
    tops = bboxes[:, 1].unsqueeze(1).unsqueeze(2)
    rights = bboxes[:, 2].unsqueeze(1).unsqueeze(2)
    bottoms = bboxes[:, 3].unsqueeze(1).unsqueeze(2)

    # Create masks using logical operations
    masks = (x_coords >= lefts) & (x_coords < rights) & (y_coords >= tops) & (y_coords < bottoms)
    masks = masks.float().unsqueeze(1)  # Convert from bool to float and add channel dimension

    return masks >= 1


def train():
    device = "cuda" if torch.cuda.is_available() else "cpu"

    model = TwoResAutoEncoder(900, 100)
    model.to(torch.device(device))
    model.train()
    
    # Optimizer
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    bg_criterion = nn.MSELoss(reduction='none')
    fg_criterion = nn.MSELoss(reduction='none')
    
    num_epochs = 50
    batch_size = 25
    dataset_size = 100
    model_save_name = "boiNet.pt"

    # Initialize dataset and dataloader
    full_dataset = FaceDataset('norm_bounding_boxes.csv', '../celeba/img_align_celeba')

    # Create a subset of the full dataset (100 samples)
    indices = np.random.permutation(len(full_dataset))[:1]
    dataset = Subset(full_dataset, indices) 

    # dataset = FaceDataset('norm_bounding_boxes.csv', '../celeba/img_align_celeba')
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

    for epoch in range(num_epochs):
        total_fg_loss = 0.0
        total_bg_loss = 0.0
        for images, faces, bboxs in dataloader:

            # Forward pass
            images = images.to(device)
            faces = faces.to(device)

            fg_masks = create_fg_masks(bboxs)
            fg_masks = fg_masks.to(device)

            optimizer.zero_grad()
            fg_output, bg_output = model(images, faces)
            
            breakpoint()
            # print(images.shape)
            # Calculate the loss for each region
            fg_loss = fg_criterion(fg_output, faces)
            bg_loss = bg_criterion(bg_output, images) * (~fg_masks)

            # Only consider masked areas by averaging non-zero entries
            bg_loss = bg_loss.sum() / (~fg_masks).sum()

            print(fg_loss.shape, bg_loss.shape)

            # Combine losses and perform backpropagation
            total_loss = fg_loss + bg_loss
            total_loss.backward()
            optimizer.step()

            # Aggregate losses for logging
            total_fg_loss += fg_loss.item()
            total_bg_loss += bg_loss.item()

        # Print epoch loss
        avg_fg_loss = total_fg_loss / len(dataloader)
        avg_bg_loss = total_bg_loss / len(dataloader)
        print(f'Epoch {epoch+1}/{num_epochs}, Foreground Loss: {avg_fg_loss:.4f}, Background Loss: {avg_bg_loss:.4f}')
    
    torch.save(model, model_save_name)

    


train()

torch.Size([1, 3, 92, 84]) torch.Size([])


RuntimeError: grad can be implicitly created only for scalar outputs

In [15]:
class SoftRound(nn.Module):
    def __init__(self):
        super().__init__()
        self.round = lambda x: x - torch.sin(2 * torch.pi * x) / (3 * torch.pi)

    def forward(self, x):
        x = self.round(x * 255)
        x = self.round(x)
        x = self.round(x)
        return x

class FGAutoencoder(nn.Module):
    def __init__(self, latent_dim):
        super(FGAutoencoder, self).__init__()
        # Encoder
        self.encoder = nn.Sequential(
            nn.Conv2d(3, 16, kernel_size=3, stride=1, padding=1),  # Maintains 92x84
            nn.ReLU(),
            nn.MaxPool2d(2, 2),  # Reduces to 46x42
            nn.Conv2d(16, 32, kernel_size=3, stride=1, padding=1),  # Maintains 46x42
            nn.ReLU(),
            nn.MaxPool2d(2, 2),  # Reduces to 23x21
            nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1),  # Maintains 23x21
            nn.ReLU(),
            nn.MaxPool2d(2, 2),  # Reduces to 11x10
            nn.Flatten(),
            nn.Linear(64 * 11 * 10, latent_dim),
            nn.Sigmoid(),
            SoftRound()
        )
        # Decoder
        # Decoder
        self.decoder = nn.Sequential(
            nn.Linear(latent_dim, 64 * 11 * 10),
            nn.ReLU(),
            nn.Unflatten(1, (64, 11, 10)),
            nn.ConvTranspose2d(64, 32, kernel_size=3, stride=2, padding=1, output_padding=1),  # Output should be ~22x20, adjust if needed
            nn.ReLU(),
            nn.ConvTranspose2d(32, 16, kernel_size=3, stride=2, padding=1, output_padding=1),  # Output should be ~44x40, adjust if needed
            nn.ReLU(),
            nn.ConvTranspose2d(16, 3, kernel_size=4, stride=2, padding=1),  # Adjust to output 92x84
            nn.Sigmoid()
        )    

    def forward(self, x):
        x = self.encoder(x)
        print(x)
        x = self.decoder(x)
        return 

class FGAutoencoder(nn.Module):
    def __init__(self, latent_dim):
        super(FGAutoencoder, self).__init__()

        self.encoder = nn.Sequential(
            nn.Conv2d(3, 16, kernel_size=3, stride=1, padding=1),  # Maintains 92x84
            nn.ReLU(),
            nn.MaxPool2d(2, 2),  # Reduces to 46x42
            nn.Conv2d(16, 32, kernel_size=3, stride=1, padding=1),  # Maintains 46x42
            nn.ReLU(),
            nn.MaxPool2d(2, 2),  # Reduces to 23x21
            nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1),  # Maintains 23x21
            nn.ReLU(),
            nn.MaxPool2d(2, 2),  # Reduces to 11x10
            nn.Flatten(),
            nn.Linear(64 * 11 * 10, latent_dim),
            nn.Sigmoid(),
            SoftRound()
        )
        
        self.decoder = nn.Sequential(
            nn.Linear(latent_dim, 64 * 11 * 10),
            nn.ReLU(),
            nn.Unflatten(1, (64, 11, 10)),
            nn.ConvTranspose2d(64, 32, kernel_size=3, stride=2, padding=1, output_padding=1),  # Outputs 23x21
            nn.ReLU(),
            nn.ConvTranspose2d(32, 16, kernel_size=3, stride=2, padding=0, output_padding=1),  # Outputs 46x42
            nn.ReLU(),
            nn.ConvTranspose2d(16, 3, kernel_size=3, stride=2, padding=1, output_padding=1),  # Outputs 92x84
            nn.Sigmoid()  # Ensures output values are between 0 and 1
        )

    def forward(self, x):
        x = self.encoder(x)
        x = self.decoder(x)

        return x



class BGAutoencoder(nn.Module):
    def __init__(self, latent_dim):
        super(BGAutoencoder, self).__init__()
        # Encoder
        self.encoder = nn.Sequential(
            nn.Conv2d(3, 16, kernel_size=3, stride=1, padding=1),  # Maintains 218x178
            nn.ReLU(),
            nn.MaxPool2d(2, 2),  # Reduces to 109x89
            nn.Conv2d(16, 32, kernel_size=3, stride=1, padding=1),  # Maintains 109x89
            nn.ReLU(),
            nn.MaxPool2d(2, 2),  # Reduces to 54x44
            nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1),  # Maintains 54x44
            nn.ReLU(),
            nn.MaxPool2d(2, 2),  # Reduces to 27x22
            nn.Flatten(),
            nn.Linear(64 * 27 * 22, latent_dim),
            nn.Sigmoid(),
            SoftRound()
        )
        # Decoder
        self.decoder = nn.Sequential(
            nn.Linear(latent_dim, 64 * 27 * 22),
            nn.ReLU(),
            nn.Unflatten(1, (64, 27, 22)),
            nn.ConvTranspose2d(64, 32, kernel_size=3, stride=2, padding=1, output_padding=1),  # Outputs 54x44
            nn.ReLU(),
            nn.ConvTranspose2d(32, 16, kernel_size=3, stride=2, padding=1, output_padding=1),  # Outputs 109x89
            nn.ReLU(),
            nn.ConvTranspose2d(16, 3, kernel_size=4, stride=2, padding=0),  # Outputs 218x178
            nn.Sigmoid()  # Ensures output values are between 0 and 1
        )

    def forward(self, x):
        x = self.encoder(x)
        x = self.decoder(x)
        return x


class TwoResAutoEncoder(nn.Module):
    def __init__(self, high_latent_dim, low_latent_dim):
        super(TwoResAutoEncoder, self).__init__()
        self.fg_ae = FGAutoencoder(high_latent_dim)
        self.bg_ae = BGAutoencoder(low_latent_dim)

    def forward(self, image, face):
        fg_output = self.fg_ae(face)
        bg_output = self.bg_ae(image)
        
        return fg_output, bg_output

In [16]:
# model = TwoResAutoEncoder(900, 100)

# a = torch.randn(1, 3, 92, 84)
# b = torch.randn(1, 3, 218, 178)

# model(b, a)

model = FGAutoencoder(5)

a = torch.randn(1, 3, 92, 84)
b = torch.randn(1, 3, 218, 178)

print(model(a).shape)

torch.Size([1, 3, 92, 84])


(None,
 tensor([[[[0.6404, 0.4547, 0.3171,  ..., 0.3815, 0.3984, 0.2204],
           [0.5142, 0.5858, 0.0777,  ..., 0.5052, 0.3436, 0.5134],
           [0.7950, 0.8414, 0.0424,  ..., 0.1109, 0.6439, 0.5930],
           ...,
           [0.6696, 0.4838, 0.5199,  ..., 0.1338, 0.4587, 0.4222],
           [0.6025, 0.4109, 0.6645,  ..., 0.4049, 0.5599, 0.7098],
           [0.7121, 0.3255, 0.5862,  ..., 0.5307, 0.5106, 0.8395]],
 
          [[0.4357, 0.7218, 0.1015,  ..., 0.4278, 0.4488, 0.6928],
           [0.7315, 0.2358, 0.8995,  ..., 0.4823, 0.4102, 0.5676],
           [0.1885, 0.3693, 0.1622,  ..., 0.0860, 0.7474, 0.1691],
           ...,
           [0.7329, 0.3511, 0.9244,  ..., 0.4755, 0.5593, 0.1928],
           [0.5994, 0.2699, 0.6051,  ..., 0.0986, 0.5236, 0.3453],
           [0.5247, 0.4191, 0.6504,  ..., 0.3457, 0.1480, 0.2162]],
 
          [[0.4031, 0.7689, 0.7145,  ..., 0.6967, 0.7937, 0.8778],
           [0.3278, 0.4516, 0.8619,  ..., 0.5053, 0.8479, 0.7525],
           [0.863