<a href="https://colab.research.google.com/github/jeffreyfeng99/SYDE_522_A3/blob/master/extract_data_parameters.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os
import pandas as pd
import numpy as np
import random
from PIL import Image
from tqdm import tqdm
from datetime import datetime

import torch
import torch.nn as nn
import torch.utils.data as data
from torch.autograd import Function
import torch.backends.cudnn as cudnn
from torchvision import transforms
from torchvision import datasets
from torchvision import models
import torch.optim as optim

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
BATCH_SIZE = 128
IMAGE_SIZE = 224 #227

In [4]:
dataset_root = '/content/drive/MyDrive/4B/SYDE-522/data'
output_root = '/content/drive/MyDrive/4B/SYDE-522/submission/03292022_resnetnorm'
source_dataset_name = 'train_set'
target_dataset_name = 'test_set'

source_image_root = os.path.join(dataset_root, source_dataset_name)
target_image_root = os.path.join(dataset_root, target_dataset_name)

train_label_list = os.path.join(dataset_root, 'train_labels.csv')

os.makedirs(output_root, exist_ok=True)

In [5]:
class GetLoader(data.Dataset):
    def __init__(self, data_root, data_list=None, transform=None):
        self.root = data_root
        self.transform = transform

        # we only pass data_list if it's training set
        if data_list is not None:
            df = pd.read_csv(data_list)
            self.img_paths = df['dir'].to_list()

            if 'label2' in df.columns:
                self.img_labels = df['label2'].to_list()
            else: 
                self.img_labels = ['0' for i in range(len(self.img_paths))]

            if 'label1' in df.columns:
                self.domain_labels = df['label1'].to_list()
            else: 
                self.domain_labels = ['0' for i in range(len(self.img_paths))]
        else:
            # Walk through test folder - we don't need labels
            self.img_paths = [f for root,dirs,files in os.walk(data_root) for f in files if f.endswith('.png')]
            self.img_labels = ['0' for i in range(len(self.img_paths))]
            self.domain_labels = ['0' for i in range(len(self.img_paths))]

        self.n_data = len(self.img_paths)

    def __getitem__(self, item):
        img_paths, labels, domain_labels = self.img_paths[item%self.n_data], self.img_labels[item%self.n_data], self.domain_labels[item%self.n_data]
        imgs = Image.open(os.path.join(self.root, img_paths)).convert('RGB')

        if self.transform is not None:

            if isinstance(self.transform, list):
                tform = self.transform[int(domain_labels)]
            else:
                tform = self.transform

            imgs = tform(imgs)
            labels = int(labels)
            domain_labels = int(domain_labels)

        return imgs, labels, domain_labels, img_paths

    def __len__(self):
        return self.n_data

In [6]:
img_transform = transforms.Compose([
    transforms.Resize(IMAGE_SIZE),
    transforms.ToTensor()
])

train_dataset = GetLoader(
    data_root=os.path.join(source_image_root, 'train_set'),
    data_list=train_label_list,
    transform=img_transform
)

train_dataloader = data.DataLoader(
    dataset=train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    num_workers=2,
    drop_last=False)
    
test_dataset = GetLoader(
    data_root=os.path.join(target_image_root, 'test_set'),
    transform=img_transform
)

test_dataloader = data.DataLoader(
    dataset=test_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    num_workers=2,
    drop_last=False)

train_mean = 0.0
domain0_mean = 0.0
domain1_mean = 0.0
domain2_mean = 0.0
domain0_count = 0
domain1_count = 0
domain2_count = 0
for images, _, domain_label, _ in tqdm(train_dataloader):
    batch_samples = images.size(0) 
    images = images.view(batch_samples, images.size(1), -1)
    train_mean += images.mean(2).sum(0)

    domain0_count += (domain_label == 0).nonzero(as_tuple=True)[0].shape[0]
    domain1_count += (domain_label == 1).nonzero(as_tuple=True)[0].shape[0]
    domain2_count += (domain_label == 2).nonzero(as_tuple=True)[0].shape[0]
    domain0_mean += torch.index_select(images, 0, (domain_label == 0).nonzero(as_tuple=True)[0]).mean(2).sum(0)
    domain1_mean += torch.index_select(images, 0, (domain_label == 1).nonzero(as_tuple=True)[0]).mean(2).sum(0)
    domain2_mean += torch.index_select(images, 0, (domain_label == 2).nonzero(as_tuple=True)[0]).mean(2).sum(0)

mean = train_mean / len(train_dataloader.dataset)
domain0_mean = domain0_mean / domain0_count
domain1_mean = domain1_mean / domain1_count
domain2_mean = domain2_mean / domain2_count

var = 0.0
domain0_var = 0.0
domain1_var = 0.0
domain2_var = 0.0
for images, _, _,_ in tqdm(train_dataloader):
    batch_samples = images.size(0)
    images = images.view(batch_samples, images.size(1), -1)

    var += ((images - mean.unsqueeze(1))**2).sum([0,2])

    domain0_var += ((torch.index_select(images, 0, (domain_label == 0).nonzero(as_tuple=True)[0]) - domain0_mean.unsqueeze(1))**2).sum([0,2]) 
    domain1_var += ((torch.index_select(images, 0, (domain_label == 1).nonzero(as_tuple=True)[0]) - domain1_mean.unsqueeze(1))**2).sum([0,2]) 
    domain2_var += ((torch.index_select(images, 0, (domain_label == 2).nonzero(as_tuple=True)[0]) - domain2_mean.unsqueeze(1))**2).sum([0,2]) 

std = torch.sqrt(var / (len(train_dataloader.dataset)*224*224))
domain0_std = torch.sqrt(domain0_var / (domain0_count*224*224))
domain1_std = torch.sqrt(domain1_var / (domain1_count*224*224))
domain2_std = torch.sqrt(domain2_var / (domain2_count*224*224))

print('Train size: ', len(train_dataloader.dataset))
print('Num Domain 0: ', domain0_count)
print('Num Domain 1: ', domain1_count)
print('Num Domain 2: ', domain2_count)

print('Global train mean: ', mean)
print('Global train std: ', std)
print('Domain 0 mean: ', domain0_mean)
print('Domain 0 std: ', domain0_std)
print('Domain 1 mean: ', domain1_mean)
print('Domain 1 std: ', domain1_std)
print('Domain 2 mean: ', domain2_mean)
print('Domain 2 std: ', domain2_std)

domain3_mean = 0.0
for images, _, domain_label, _ in tqdm(test_dataloader):
    batch_samples = images.size(0) 
    images = images.view(batch_samples, images.size(1), -1)
    domain3_mean += images.mean(2).sum(0)
    train_mean += images.mean(2).sum(0)

domain3_mean = domain3_mean / len(test_dataloader.dataset)
global_mean = train_mean/(len(train_dataloader.dataset)+len(test_dataloader.dataset))

global_var = 0.0
for images, _, _,_ in tqdm(train_dataloader):
    batch_samples = images.size(0)
    images = images.view(batch_samples, images.size(1), -1)

    global_var += ((images - global_mean.unsqueeze(1))**2).sum([0,2])

domain3_var = 0.0
for images, _, _,_ in tqdm(test_dataloader):
    batch_samples = images.size(0)
    images = images.view(batch_samples, images.size(1), -1)

    domain3_var += ((images - domain3_mean.unsqueeze(1))**2).sum([0,2])
    global_var += ((images - global_mean.unsqueeze(1))**2).sum([0,2])

domain3_std = torch.sqrt(domain3_var / (len(test_dataloader.dataset)*224*224))
global_std = torch.sqrt(global_var / ((len(test_dataloader.dataset)+len(train_dataloader.dataset))*224*224))

print('Test size: ', len(test_dataloader.dataset))
print('Domain 3 mean: ', domain3_mean)
print('Domain 3 std: ', domain3_std)

print('Global Mean: ', global_mean)
print('Global std: ', global_std)

100%|██████████| 48/48 [10:43<00:00, 13.40s/it]
100%|██████████| 48/48 [00:30<00:00,  1.59it/s]


Train size:  6062
Num Domain 0:  1670
Num Domain 1:  2048
Num Domain 2:  2344
Global train mean:  tensor([0.6399, 0.6076, 0.5603])
Global train std:  tensor([0.3065, 0.3082, 0.3353])
Domain 0 mean:  tensor([0.5085, 0.4832, 0.4396])
Domain 0 std:  tensor([0.1780, 0.1779, 0.1907])
Domain 1 mean:  tensor([0.5550, 0.5085, 0.4579])
Domain 1 std:  tensor([0.1880, 0.1917, 0.2060])
Domain 2 mean:  tensor([0.8077, 0.7829, 0.7358])
Domain 2 std:  tensor([0.2239, 0.2283, 0.2437])


100%|██████████| 31/31 [07:43<00:00, 14.95s/it]
100%|██████████| 48/48 [00:29<00:00,  1.62it/s]
100%|██████████| 31/31 [00:19<00:00,  1.58it/s]

Test size:  3929
Domain 3 mean:  tensor([0.9566, 0.9566, 0.9566])
Domain 3 std:  tensor([0.1752, 0.1752, 0.1752])
Global Mean:  tensor([0.7645, 0.7449, 0.7162])
Global std:  tensor([0.3050, 0.3143, 0.3432])





Train size:  6062

Num Domain 0:  1670

Num Domain 1:  2048

Num Domain 2:  2344

Global train mean:  tensor([0.6399, 0.6076, 0.5603])

Global train std:  tensor([0.3065, 0.3082, 0.3353])

Domain 0 mean:  tensor([0.5085, 0.4832, 0.4396])

Domain 0 std:  tensor([0.1780, 0.1779, 0.1907])

Domain 1 mean:  tensor([0.5550, 0.5085, 0.4579])

Domain 1 std:  tensor([0.1880, 0.1917, 0.2060])

Domain 2 mean:  tensor([0.8077, 0.7829, 0.7358])

Domain 2 std:  tensor([0.2239, 0.2283, 0.2437])

Test size:  3929

Domain 3 mean:  tensor([0.9566, 0.9566, 0.9566])

Domain 3 std:  tensor([0.1752, 0.1752, 0.1752])

Global Mean:  tensor([0.7645, 0.7449, 0.7162])

Global std:  tensor([0.3050, 0.3143, 0.3432])