In [9]:
import os 
import numpy as np
import pandas as pd
import cv2

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data as utils
import torch.optim as optim
from torchvision import transforms
import torchvision.transforms.functional as F

In [10]:
# Loading data
train_df = pd.read_csv("Data/train.csv")
test_df = pd.read_csv("Data/test.csv")

# Drop unnecessary columns
train_df = train_df.drop(['Subject Focus', 'Eyes', 'Face', 'Near', 'Action', 'Accessory', 'Group', 'Collage', 'Human', 'Occlusion', 'Info', 'Blur'],axis=1)
test_df = test_df.drop(['Subject Focus', 'Eyes', 'Face', 'Near', 'Action', 'Accessory', 'Group', 'Collage', 'Human', 'Occlusion', 'Info', 'Blur'],axis=1)

# add image path to new column
train_df["im_path"] = "Data/train/" + train_df['Id'] + ".jpg"
test_df["im_path"] = "Data/train/" + test_df['Id'] + ".jpg"

I resize the images to 128 x 128 images for consistent, easier training. I augmented the dataset through grayscaling, blurring, and random flipping and perpestive.

In [11]:
# Augmented data by transforming to grayscale, resizing, random flipping vertically and horizontally, handling the distortion of image, and transforming random perspective
def augment_img(image, new_image_size):
    transpose = transforms.Compose([
                transforms.ToTensor(),
                transforms.Grayscale(),
                transforms.Resize(new_image_size),
                transforms.RandomHorizontalFlip( p = 0.5),
                transforms.RandomVerticalFlip(p = 0.5),
                transforms.GaussianBlur((3,3)),
                transforms.RandomPerspective(distortion_scale=0.5, p=0.5)])
    image = transpose(image)
    return image

In [4]:
new_image_size = (128, 128)
  
A, B = [], []
for i in range(len(train_df)):
    image = cv2.imread(train_df["im_path"][i])
    paw_score = train_df["Pawpularity"][i]
    image = augment_img(image, new_image_size)
    A.append(image)
    B.append(torch.from_numpy(np.array(paw_score)))

A, B = torch.stack(A), torch.stack(B)

# Model Architecture:
I use 6 layers for my CNN architecture:
- Convolution Layers for extracting the features from the images.
- Relu activation: increase the non-linearity in our images and since the image pixel is bigger than 0, keep pixels carrying a positive value.
- Max Pooling layers: reduce parameters counts and computational complexity; prevent overfitting problems for the model.
- Dropout Layers
- Set of fully connected layers

In [5]:
# define the CNN architecture
class Network(nn.Module):
    def __init__(self):
        super(Network, self).__init__()
        self.layer = nn.Sequential(
            nn.Conv2d(1,32,5,3), # size 42x42x32
            nn.ReLU(),
            nn.Conv2d(32,64,3),  # size 40x40x64
            nn.ReLU(),
            nn.BatchNorm2d(64), 
            nn.Conv2d(64,128,3), # size 38x38x128
            nn.ReLU(),
            nn.BatchNorm2d(128),
            nn.MaxPool2d(2,2), # size 19x19x128
            nn.Dropout2d(p = 0.1), # size 19x19x128
            nn.ReLU(),
            nn.Conv2d(128,256,3), # size 17x17x256
            nn.ReLU(),
            nn.BatchNorm2d(256),
            nn.Dropout2d(p = 0.1),
            nn.Conv2d(256,512,3), # size 15x15x512
            nn.ReLU(),
            nn.Conv2d(512,1024,3,2), # size 7x7x1024
            nn.BatchNorm2d(1024),
            nn.Flatten(), # size (50176)
            nn.Linear(50176, 50), 
            nn.Dropout(p = 0.5),
            nn.ReLU(),
            nn.Linear(50,1)
        )
    def forward(self, x):
        return self.layer(x)

In [6]:
class RMSELoss(torch.nn.Module):
    def __init__(self):
        super(RMSELoss,self).__init__()

    def forward(self,x,y):
        criterion = nn.MSELoss()
        loss = torch.sqrt(criterion(x.float(), y.float()))
        return loss

def train(model, device, train_loader, optimizer, epoch):
    RMSE = RMSELoss()
    model.train()
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        target = target.unsqueeze(1)
        RMSE_loss = torch.sqrt(nn.MSELoss()(output.squeeze(0).float(), target.float()))
        RMSE_loss.backward()
        optimizer.step()
        if batch_idx == 0: #Print loss for the first batch
            print('Train Epoch: {}\tRMSE_train: {:.6f}'.format(
                epoch,  RMSE_loss.item()))
                
def validate(model, device, test_loader):
    model.eval()
    RMSE = RMSELoss()
    loss = 0 
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            loss += RMSE.forward(output.squeeze(0),target)
    print('RMSE_val: {:.6f}'.format((loss/len(test_loader.dataset))))

In [7]:
learning_rate = 0.0001
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
network = Network().to(device)
optimizer = optim.Adam(network.parameters(),lr=learning_rate)

In [12]:
batch_size = 32
epochs = 35
dataset = utils.TensorDataset(A, B)
train_dataset,test_dataset = utils.random_split(dataset, lengths  =[round(len(A) * 0.8),len(A) - round(len(A) * 0.8)], generator=torch.Generator().manual_seed(16))
train_loader = utils.DataLoader(train_dataset,shuffle = True, batch_size = batch_size)
test_loader = utils.DataLoader(test_dataset,shuffle = False)
for epoch in range(epochs):
        train(network, device, train_loader, optimizer, epoch)
torch.save(network.state_dict(), "PET.pt")

Train Epoch: 0	RMSE_train: 36.597328
Train Epoch: 1	RMSE_train: 18.820772
Train Epoch: 2	RMSE_train: 19.676891
Train Epoch: 3	RMSE_train: 18.396284
Train Epoch: 4	RMSE_train: 24.029833
Train Epoch: 5	RMSE_train: 24.693430
Train Epoch: 6	RMSE_train: 20.745470
Train Epoch: 7	RMSE_train: 24.628674
Train Epoch: 8	RMSE_train: 26.161039
Train Epoch: 9	RMSE_train: 23.384872
Train Epoch: 10	RMSE_train: 26.452007
Train Epoch: 11	RMSE_train: 17.245005
Train Epoch: 12	RMSE_train: 20.148777
Train Epoch: 13	RMSE_train: 22.859543
Train Epoch: 14	RMSE_train: 19.329481
Train Epoch: 15	RMSE_train: 19.812038
Train Epoch: 16	RMSE_train: 20.769924
Train Epoch: 17	RMSE_train: 21.715115
Train Epoch: 18	RMSE_train: 22.511499
Train Epoch: 19	RMSE_train: 22.407368
Train Epoch: 20	RMSE_train: 18.071106
Train Epoch: 21	RMSE_train: 17.951204
Train Epoch: 22	RMSE_train: 20.234882
Train Epoch: 23	RMSE_train: 18.093975
Train Epoch: 24	RMSE_train: 13.387535
Train Epoch: 25	RMSE_train: 16.562382
Train Epoch: 26	RMSE_t

In [13]:
validate(network,device,test_loader)

RMSE_val: 17.619154


Interestingly, I received a RMSE score of 17.619, which is in the top 10 of the Kaggle Leaderboard. My solution could be adapted into AI tools that will guide shelters and rescuers around the world to improve the appeal of their pet profiles, automatically enhancing photo quality and recommending composition improvements.

In [8]:
cnn = pd.DataFrame()
cnn['Id'] = test_df['Id']
test = []
for index in range(len(test_df["im_path"])):
    image = cv2.imread(train_df["im_path"][i])
    image = augment_img(image, new_image_size)
    image = image.to(device)
    score = network(image[None,:,:,:]).cpu().detach().numpy()
    test. append(score[0,0])
cnn['Pawpularity'] = test
cnn.to_csv('submission.csv',index=False)