<a href="https://colab.research.google.com/github/hemanth346/mde_bs/blob/master/notebooks/Cal_DataStats.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import os
import cv2
import numpy as np
from pathlib import Path
import zipfile
from PIL import Image
import io
import matplotlib.pyplot as plt

In [0]:
# Dataset root (a Google Drive path as it appears when mounted in Colab).
root_dir = '/content/drive/My Drive/DL/mask_depthmap_data/Compressed_Dataset'

# One sub-directory per kind of image; the names mirror the folder names on disk.
bg_dir = Path(root_dir, 'bg')
fg_dir = Path(root_dir, 'fg')
fg_masks_dir = Path(root_dir, 'fg_masks')
fg_bg_dir = Path(root_dir, 'fg_bg')
mask_dir = Path(root_dir, 'fg_bg_masks')
depth_dir = Path(root_dir, 'depth_maps')

In [0]:
# Sanity check: display the resolved composite, mask and depth-map directories.
fg_bg_dir, mask_dir, depth_dir

(WindowsPath('/content/drive/My Drive/DL/mask_depthmap_data/Compressed_Dataset/fg_bg'),
 WindowsPath('/content/drive/My Drive/DL/mask_depthmap_data/Compressed_Dataset/fg_bg_masks'),
 WindowsPath('/content/drive/My Drive/DL/mask_depthmap_data/Compressed_Dataset/depth_maps'))

In [0]:
import torch
import torchvision
from torch.utils.data import DataLoader, Dataset
from torchvision.transforms import transforms

In [0]:
class FolderData(Dataset):
    """Dataset over the image files found recursively under a directory.

    Args:
        data_dir: Directory to search (``str`` or ``pathlib.Path``).
        suffix: File extension without the dot (e.g. ``'jpg'``). When empty,
            every file under ``data_dir`` is included.
        size: Each image is resized to ``(size, size)`` before conversion
            to a tensor.

    Indexing returns the image converted to RGB and passed through
    ``Resize`` followed by ``ToTensor``.
    """

    def __init__(self, data_dir, suffix='', size=224):
        pattern = '*.' + suffix if suffix else '*'
        # rglob also yields directories (always for '*', and for any directory
        # whose name matches the pattern); keep regular files only so that
        # Image.open never receives a directory. Sorting makes the index ->
        # file mapping independent of filesystem enumeration order.
        self.files = sorted(
            path for path in Path(data_dir).rglob(pattern) if path.is_file()
        )
        self.transform = transforms.Compose([
            transforms.Resize((size, size)),
            transforms.ToTensor()
        ])

    def __len__(self):
        """Number of image files discovered under ``data_dir``."""
        return len(self.files)

    def __getitem__(self, index):
        """Load, RGB-convert and transform the image at position ``index``."""
        # convert() decodes the pixels into a new image object, so the
        # underlying file can be closed as soon as the with-block exits.
        with Image.open(self.files[index]) as img:
            rgb = img.convert("RGB")
        return self.transform(rgb)

In [0]:
class ZipData(Dataset):
    """Dataset over the images stored inside the zip archives of a directory.

    Args:
        data_dir: Directory that contains one or more ``.zip`` archives.
        size: Each image is resized to ``(size, size)`` before conversion
            to a tensor.

    Attributes:
        paths: Member names of every image, across all archives.
        data_dir: The directory passed to the constructor.
    """

    def __init__(self, data_dir, size=224):
        self.paths = []
        # Archive that each entry of `self.paths` was listed from (same index),
        # so lookups do not depend on a member-name / archive-name convention.
        self._archives = []
        # Sorted so that the index -> image mapping is reproducible.
        for file in sorted(os.listdir(data_dir)):
            fname = os.path.join(data_dir, file)
            if not zipfile.is_zipfile(fname):
                continue
            # Context manager closes the archive handle once it has been listed.
            with zipfile.ZipFile(fname) as archive:
                # Skip directory entries: they are not images.
                members = [
                    info.filename
                    for info in archive.infolist()
                    if not info.is_dir()
                ]
            self.paths += members
            self._archives += [fname] * len(members)
        self.data_dir = data_dir
        self.transform = transforms.Compose([
            transforms.Resize((size, size)),
            transforms.ToTensor()
        ])

    def read_img_from_zip(self, zip_name, file_name, array=True):
        """Read one image out of a zip archive.

        Args:
            zip_name: Path of the zip archive.
            file_name: Name of the member inside the archive.
            array: When true return a ``numpy`` array, otherwise the PIL image.

        Returns:
            The image as ``np.ndarray`` or ``PIL.Image`` (mode left unchanged).
        """
        # Read the raw bytes and close the archive right away; the image is
        # decoded from the in-memory copy.
        with zipfile.ZipFile(zip_name) as archive:
            imgdata = archive.read(file_name)
        img = Image.open(io.BytesIO(imgdata))
        # img = img.convert("RGB")
        if array:
            return np.array(img)
        # PIL image
        return img

    def __len__(self):
        """Total number of images across all archives."""
        return len(self.paths)

    def __getitem__(self, index):
        """Load and transform the image at position ``index``."""
        img = self.read_img_from_zip(
            self._archives[index], self.paths[index], array=False
        )
        return self.transform(img)

In [0]:
# Two passes over the data (one for the mean, one for the std) give exact values;
# since this is a one-time computation, the extra cost is acceptable.

# An alternative is to estimate the standard deviation by averaging the standard
# deviations of the mini-batches. That is very close to the true value but not exact;
# it can be used when time or compute is limited.

def get_mean(dataset, batch_size=50):
    """Compute the per-channel mean over every image in ``dataset``.

    Args:
        dataset: Map-style dataset whose items are image tensors that the
            default collate function can stack into one batch tensor.
        batch_size: Number of images loaded per step.

    Returns:
        1-D tensor with one mean per channel.

    Raises:
        ValueError: If ``dataset`` is empty.
    """
    if len(dataset) == 0:
        raise ValueError('cannot compute the mean of an empty dataset')

    # The result is a plain sum over all images, so visiting order does not
    # matter and shuffling would be wasted work.
    loader = DataLoader(dataset,
                        batch_size=batch_size,
                        shuffle=False)
    channel_sum = 0.0
    for images in loader:
        # Flatten the spatial dimensions: (batch, channels, pixels).
        flat = images.view(images.size(0), images.size(1), -1)
        # Per-image channel means, summed over the batch.
        channel_sum = channel_sum + flat.mean(2).sum(0)
    # Average of the per-image means.
    return channel_sum / len(loader.dataset)

def get_std(dataset, mean, batch_size=50):
    """Compute the per-channel standard deviation over every pixel in ``dataset``.

    Args:
        dataset: Map-style dataset whose items are image tensors that the
            default collate function can stack into one batch tensor.
        mean: 1-D tensor of per-channel means (e.g. from ``get_mean``).
        batch_size: Number of images loaded per step.

    Returns:
        1-D tensor with one (population) standard deviation per channel.

    Raises:
        ValueError: If ``dataset`` is empty.
    """
    if len(dataset) == 0:
        raise ValueError('cannot compute the std of an empty dataset')

    # Order does not affect the sums below, so no shuffling is needed.
    loader = DataLoader(dataset,
                        batch_size=batch_size,
                        shuffle=False)
    sq_dev_sum = 0.0
    pixels_per_channel = 0
    for images in loader:
        # convert into flattened channels: (batch, channels, pixels)
        flat = images.view(images.size(0), images.size(1), -1)
        # subtract the channel mean, square, and sum over batch and pixels
        sq_dev_sum = sq_dev_sum + ((flat - mean.unsqueeze(1)) ** 2).sum([0, 2])
        # Count pixels as they are seen instead of re-loading dataset[0]
        # afterwards and assuming every item has its size.
        pixels_per_channel += flat.size(0) * flat.size(2)
    # square root of the mean squared deviation over all pixels
    return torch.sqrt(sq_dev_sum / pixels_per_channel)

In [0]:
# Text file that collects the per-dataset statistics; the following cells
# write to it and the last cell closes it.
stats = open('datastats.txt', mode='w')

In [0]:
# One-off conversion, kept commented out for reference: re-save every background
# whose name does not end in 'jpg' as an RGB image named '<stem>.jpg', where
# <stem> is the text before the first '.' in the file name.
# jp = [x for x in os.listdir(bg_dir) if not x.endswith('jpg')]
# for f in jp:
#     img = Image.open(os.path.join(bg_dir, f))
#     img = img.convert("RGB")
#     print(f, f.split('.')[0]+'.jpg')
#     img.save(os.path.join(bg_dir, f.split('.')[0]+'.jpg'))

office63.jpeg office63.jpg
office69.gif office69.jpg
office70.png office70.jpg
office75.png office75.jpg
office76.png office76.jpg
office80.jpeg office80.jpg
office89.jpeg office89.jpg
office91.jpeg office91.jpg


In [0]:
# One-off cleanup, kept commented out for reference: delete the background files
# whose name does not end in 'jpg', then list any that remain.
# [os.remove(os.path.join(bg_dir, x)) for x in os.listdir(bg_dir) if not x.endswith('jpg')]
# [x for x in os.listdir(bg_dir) if not x.endswith('jpg')]

[]

In [0]:
# Header line; newline-terminated so the first record starts on its own line.
stats.write('mean\t\tstd\n')

In [0]:
# Channel-wise statistics of the background images.
dataset = FolderData(bg_dir, suffix='jpg')
mean = get_mean(dataset)
std = get_std(dataset, mean)
# Newline-terminated so each dataset gets its own line in datastats.txt.
stats.write(f'BG : mean - {mean} ; std - {std}\n')
print(mean, std)

tensor([0.5868, 0.5753, 0.5547]) tensor([0.2566, 0.2563, 0.2631])


In [0]:
# Channel-wise statistics of the foreground images.
dataset = FolderData(fg_dir, suffix='png')
mean = get_mean(dataset)
std = get_std(dataset, mean)
print(mean, std)
# Newline-terminated so each dataset gets its own line in datastats.txt.
stats.write(f'FG : mean - {mean} ; std - {std}\n')

tensor([0.4965, 0.4817, 0.4846]) tensor([0.4152, 0.4059, 0.4015])


85

In [0]:
# Channel-wise statistics of the foreground masks.
dataset = FolderData(fg_masks_dir, suffix='png')
mean = get_mean(dataset)
std = get_std(dataset, mean)
print(mean, std)
# Newline-terminated so each dataset gets its own line in datastats.txt.
stats.write(f'FG masks: mean - {mean} ; std - {std}\n')

tensor([0.4190, 0.4190, 0.4190]) tensor([0.4883, 0.4883, 0.4883])


90

In [0]:
# Channel-wise statistics of the zipped foreground-on-background composites.
dataset = ZipData(data_dir=fg_bg_dir)

mean = get_mean(dataset)
std = get_std(dataset, mean)
print(mean, std)
# Newline-terminated so each dataset gets its own line in datastats.txt.
stats.write(f'FG_BG : mean - {mean} ; std - {std}\n')

tensor([0.6045, 0.5874, 0.5730]) tensor([0.2815, 0.2813, 0.2814])


88

In [0]:
# Statistics of the zipped masks of the composites.
dataset = ZipData(data_dir=mask_dir)

mean = get_mean(dataset)
std = get_std(dataset, mean)
print(mean, std)
# Newline-terminated so each dataset gets its own line in datastats.txt.
stats.write(f'FG_BG_Masks : mean - {mean} ; std - {std}\n')

tensor([0.0563]) tensor([0.2252])


62

In [0]:
# Statistics of the zipped depth maps. These must be read from depth_dir:
# pointing at mask_dir would just recompute the mask statistics and record
# them under the 'Depth_maps' label.
dataset = ZipData(data_dir=depth_dir)

mean = get_mean(dataset)
std = get_std(dataset, mean)
print(mean, std)
# Newline-terminated so each dataset gets its own line in datastats.txt.
stats.write(f'Depth_maps : mean - {mean} ; std - {std}\n')

tensor([0.3743]) tensor([0.1962])


61

In [0]:
# Close the stats file so that everything written above is flushed to disk.
stats.close()