In [2]:
import os
import numpy as np
import pandas as pd
from PIL import Image
import torch
import torchvision
from torchvision import transforms as T
from torch.utils.data import Dataset
from sklearn.preprocessing import LabelEncoder

In [3]:
class ImageNetDataset(Dataset):
    """Map-style dataset that reads images listed in an annotations CSV.

    The CSV must provide an ``ImageId`` column (file name without extension)
    and a ``PredictionString`` column whose first space-separated token is the
    class name of the image.

    Args:
        annotations_file: Path to the annotations CSV.
        img_dir: Root directory that holds the ``.JPEG`` files.
        transform: Optional callable applied to the image (an ``H x W x 3``
            RGB numpy array) before it is returned.
        target_transform: Optional callable applied to the integer label.
        mode: ``"train"`` looks for images in per-class sub-directories
            (``img_dir/<class name>/<ImageId>.JPEG``); any other value looks
            directly in ``img_dir`` (``img_dir/<ImageId>.JPEG``).
    """

    def __init__(
        self,
        annotations_file,
        img_dir,
        transform=None,
        target_transform=None,
        mode="train",
    ):
        self.df = pd.read_csv(annotations_file)
        # Encode the labels once; the encoding walks the whole CSV column.
        labelNums, labelNames, labelMap = self.getLabels()
        # int64, not uint8: uint8 silently wraps every class index >= 256,
        # which corrupts the labels as soon as there are more than 256 classes.
        self.labelNums = np.asarray(labelNums, dtype=np.int64)
        self.labelNames = labelNames
        self.labelMap = labelMap  # class name -> integer index
        self.img_names = self.getImgNames()
        self.img_dir = img_dir
        self.transform = transform
        self.target_transform = target_transform
        self.mode = mode

    def __len__(self):
        """Return the number of annotated images."""
        return len(self.labelNums)

    def __getitem__(self, idx):
        """Load the ``idx``-th image and return ``(image, label)``."""
        labelNum = self.labelNums[idx]
        labelName = self.labelNames[idx]
        if self.mode == "train":
            img_path = (
                os.path.join(self.img_dir, labelName, self.img_names[idx]) + ".JPEG"
            )
        else:
            img_path = os.path.join(self.img_dir, self.img_names[idx]) + ".JPEG"
        # The context manager closes the underlying file handle deterministically.
        with Image.open(img_path) as raw:
            img = np.array(raw.convert("RGB"))
        if self.transform:
            img = self.transform(img)
        if self.target_transform:
            labelNum = self.target_transform(labelNum)
        return img, labelNum

    def getLabels(self):
        """Derive class names and integer labels from ``PredictionString``.

        Returns:
            A tuple ``(labelNums, labelNames, labelMap)``: the per-row integer
            labels (list of int), the per-row class names (list of str) and a
            dict mapping each class name to its integer label. Integer labels
            are the positions of the class names in sorted order.
        """
        labelNames = [
            row.split(" ")[0] for row in self.df["PredictionString"].values.tolist()
        ]
        # Sorted unique classes + inverse indices: the same encoding that
        # sklearn's LabelEncoder.fit_transform produces for string labels.
        classes, inverse = np.unique(np.asarray(labelNames), return_inverse=True)
        labelNums = inverse.reshape(-1).tolist()
        lableMap = dict(zip(classes, range(len(classes))))
        return labelNums, labelNames, lableMap

    def getImgNames(self):
        """Return the ``ImageId`` column as a plain list."""
        return self.df["ImageId"].values.tolist()

In [4]:
# Per-channel mean / std applied to every crop after conversion to a tensor.
normalize = T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])


def _stack_normalized_crops(crops):
    """Normalize each crop and stack the results along a new leading axis."""
    return torch.stack([normalize(crop) for crop in crops])


# Pipeline: tensor conversion -> 256 center crop -> random horizontal flip
# -> ten 224 crops -> normalize every crop and stack them into one tensor.
train_transform = T.Compose(
    [
        T.ToTensor(),
        T.CenterCrop(256),
        T.RandomHorizontalFlip(),
        T.TenCrop(224),
        T.Lambda(_stack_normalized_crops),
    ]
)

In [5]:
# Dataset root, relative to the notebook's working directory.
rootDir = "../../../datasets/ImageNet/"

train_annotations = os.path.join(rootDir, "LOC_train_solution.csv")
train_img_dir = os.path.join(rootDir, "ILSVRC/Data/CLS-LOC/train")

# "train" mode reads images from per-class sub-directories of train_img_dir.
train_dataset = ImageNetDataset(
    train_annotations,
    train_img_dir,
    transform=train_transform,
    mode="train",
)

In [6]:
# Sanity check: how many crops does the transform produce for one sample?
first_sample = train_dataset[0]
len(first_sample[0])

10

In [7]:
# Loader settings gathered in one place so they are easy to tweak.
loader_options = dict(
    batch_size=16,
    shuffle=True,
    num_workers=2,
    pin_memory=True,
)
train_loader = torch.utils.data.DataLoader(train_dataset, **loader_options)

In [8]:
# Shape of the image tensor in one batch drawn from the loader.
batch_images = next(iter(train_loader))[0]
batch_images.shape

torch.Size([16, 10, 3, 224, 224])

In [9]:
# Number of crops per sample, read from axis 1 of a batch of images.
n_tta = next(iter(train_loader))[0].shape[1]

In [10]:
# Selecting crop 0 along axis 1 drops the crop axis from the batch.
torch.select(next(iter(train_loader))[0], 1, 0).shape

torch.Size([16, 3, 224, 224])

In [11]:
from models.alexnet import AlexNet

# AlexNet instance with 3 output classes and a dropout rate of 0.1.
model = AlexNet(num_classes=3, dropout=0.1)

In [22]:
# Average the model outputs over all crops of one batch.
# `inputs` is expected to be (batch, n_crops, C, H, W), as printed earlier for
# this loader.
inputs = next(iter(train_loader))[0]
# Read the crop count from this very batch instead of relying on a value
# computed from a different batch in another cell.
n_crops = inputs.size(1)

# Averaging over crops is only meaningful when the forward pass is
# deterministic, so disable dropout (eval mode) and skip autograd bookkeeping
# (no_grad). Assumes `model` is a torch.nn.Module -- TODO confirm.
# The previous train/eval mode is restored afterwards so later cells see the
# model unchanged.
was_training = model.training
model.eval()
outputs_sum = None
outputs_list = []
try:
    with torch.no_grad():
        for i in range(n_crops):
            inputs_crop = torch.select(inputs, 1, i)
            outputs = model(inputs_crop).to("cpu")
            # Start the accumulator from the first output rather than running
            # an extra forward pass just to learn the output shape.
            outputs_sum = outputs.clone() if outputs_sum is None else outputs_sum + outputs
            crop_mean = outputs.mean().item()
            print(round(crop_mean, 6))
            outputs_list.append(crop_mean)
finally:
    model.train(was_training)

outputs_mean = outputs_sum / n_crops
# Both numbers below should agree: the mean of the crop-averaged outputs
# equals the average of the per-crop means.
print(round(outputs_mean.mean().item(), 6))
print(round(sum(outputs_list) / len(outputs_list), 6))

0.008453
0.008357
0.008895
0.008668
0.008383
0.008944
0.008478
0.00845
0.008827
0.008963
0.008642
0.008642


0.008534