# Exploring the labels

In [8]:
from collections import defaultdict

import numpy as np
import torch
from torch.utils.data import SubsetRandomSampler, DataLoader
from torchvision import transforms

# Seed NumPy's global RNG and PyTorch's default generator with the same value
# so that random operations later in the notebook are repeatable.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

<torch._C.Generator at 0x7fcf53fc1db0>

In [2]:
import os
import sys

# Put the `firepunks` directory under the current working directory on the
# module search path. The membership check keeps the cell idempotent:
# re-running it no longer appends a duplicate entry each time.
firepunks_dir = os.path.join(os.getcwd(), 'firepunks')
if firepunks_dir not in sys.path:
    sys.path.append(firepunks_dir)

from firepunks import datasets as DS

# Using Fire Punks Dataset

In [3]:
# Load the punk labels and split their indices, asking for 1000 test items.
punks_labels = DS.load_labels(DS.PUNK_LABELS)
train_idx, test_idx = DS.split_labels(punks_labels, 1000)

# Report the size of each split: train first, then test.
print(len(train_idx), len(test_idx), sep="\n")

9000
1000


We'll create DataLoaders for the firepunks dataset, using a transform that converts the PIL images into scaled PyTorch tensors.

In [9]:
# ToTensor converts each PIL image into a (scaled) tensor.
punk_transforms = transforms.Compose([transforms.ToTensor()])

batch_size = 32
punks_ds = DS.FirePunksDataset(DS.ALL_LABELS, test_size=2000, transform=punk_transforms)

# Each sampler draws, in random order, only from its own index subset.
train_sampler = SubsetRandomSampler(punks_ds.train_idx)
test_sampler = SubsetRandomSampler(punks_ds.test_idx)

# Both loaders wrap the same dataset; the sampler decides which items each
# one sees, so shuffle stays off.
train_loader, test_loader = (
    DataLoader(dataset=punks_ds, batch_size=batch_size, shuffle=False, sampler=sampler)
    for sampler in (train_sampler, test_sampler)
)

Using batch size 32 with 2000 test items means the last batch has 16 items instead of 32: 2000 = 62 × 32 + 16, so the test loader yields 63 batches.

In [11]:
loaders = (train_loader, test_loader)

# len(loader) is the number of batches. Multiplying it by batch_size
# over-counts the items whenever the final batch is only partially filled.
for loader in loaders:
    print(len(loader))               # 250 (train), 63 (test)
    print(len(loader) * batch_size)  # 8000 (train), 2016 (test)

# Walk each loader once and remember only the size of its final batch.
# Nothing is accumulated, so the whole dataset is never held in memory, and
# the counter is reset per loader so a loader that yields nothing reports 0
# instead of reusing a value left over from the previous loop.
for loader in loaders:
    last_batch_size = 0
    for punk, label in loader:
        last_batch_size = len(punk)
    # train: 32 (full last batch); test: 16, i.e. 62 * 32 + 16 = 2000 items
    print(last_batch_size)

250
8000
63
2016
32
16


Different test sizes work fine with the firepunks dataset.

In [12]:
# Rebuild the dataset with two other test sizes and report the resulting
# train/test index counts. After the loop, `punks_ds` is bound to the last
# dataset built (test_size=0).
for test_size in (1000, 0):
    punks_ds = DS.FirePunksDataset(DS.ALL_LABELS, test_size=test_size, transform=punk_transforms)
    print(len(punks_ds.train_idx))
    print(len(punks_ds.test_idx))

9000
1000
10000
0


# Using CPunks Dataset

We do the same thing as above, but with `CPunksDataset`, which uses the legacy format from cpunks.

In [None]:
# Same load-and-split step as for the FirePunks labels, but through the
# `*_df` helpers (presumably DataFrame-based, going by their names).
punks_df = DS.load_labels_df(DS.PUNK_LABELS)
train_idx, test_idx = DS.split_df(punks_df, 1000)

# Report the size of each split: train first, then test.
print(len(train_idx), len(test_idx), sep="\n")

In [None]:
batch_size = 32

# Note: unlike the FirePunks dataset above, no transform is passed here.
punks_ds = DS.CPunksDataset(DS.ALL_LABELS, test_size=2000)

# Each sampler draws, in random order, only from its own index subset.
train_sampler = SubsetRandomSampler(punks_ds.train_idx)
test_sampler = SubsetRandomSampler(punks_ds.test_idx)

# Settings shared by both loaders; only the sampler differs between them.
loader_kwargs = dict(dataset=punks_ds, batch_size=batch_size, shuffle=False)
train_loader = DataLoader(sampler=train_sampler, **loader_kwargs)
test_loader = DataLoader(sampler=test_sampler, **loader_kwargs)


In [None]:
loaders = (train_loader, test_loader)

# len(loader) is the number of batches. Multiplying it by batch_size
# over-counts the items whenever the final batch is only partially filled.
# The values in the comments are the ones expected from the original
# annotations; this cell has no recorded output to confirm them.
for loader in loaders:
    print(len(loader))               # expected: 250 (train), 63 (test)
    print(len(loader) * batch_size)  # expected: 8000 (train), 2016 (test)

# Walk each loader once and remember only the size of its final batch.
# Nothing is accumulated, so the whole dataset is never held in memory, and
# the counter is reset per loader so a loader that yields nothing reports 0
# instead of reusing a value left over from the previous loop.
for loader in loaders:
    last_batch_size = 0
    for punk, label in loader:
        last_batch_size = len(punk)
    # expected: 32 for train, 16 for test (62 * 32 + 16 = 2000 items)
    print(last_batch_size)


In [None]:
# Rebuild the CPunks dataset with two other test sizes and report the
# resulting train/test index counts. After the loop, `punks_ds` is bound to
# the last dataset built (test_size=0).
for test_size in (1000, 0):
    punks_ds = DS.CPunksDataset(DS.ALL_LABELS, test_size=test_size)
    print(len(punks_ds.train_idx))
    print(len(punks_ds.test_idx))