In [111]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset, random_split
from torchvision import transforms
import torchvision.io as io

In [152]:
# Steps:
## Create training and validation datasets from files.
## Create data loaders for each dataset
class SDSSImagery(Dataset):
    def __init__(self, labels, transform=None):
        self.img_labels = pd.read_csv(labels, dtype={'image_filepath': 'str', 'Hubble_coarse' : 'category'})
        self.transform = transform
    
    def __len__(self):
        return len(self.img_labels)
    
    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()
        
        img_name = self.img_labels.iloc[idx, 0]
        image = io.read_image(img_name)
        image = image / 255
        label = self.img_labels.iloc[idx, 1]

        sample = {'image' : image, 'label' : label}

        if self.transform:
            sample = self.transform(sample)
        return sample

In [153]:
dataset = SDSSImagery(labels='image_labels.csv')
train_ds, val_ds = random_split(dataset, [0.8, 0.2])

batch_size = 64
train_dataloader = DataLoader(train_ds, batch_size, shuffle=True)
val_dataloader = DataLoader(val_ds, batch_size, shuffle=True)

In [104]:
image_size = 256
num_classes = 4
BasicCNN = nn.Sequential(
    nn.Conv2d(3, 16, 3, padding='same'),
    nn.ReLU(),
    nn.MaxPool2d(2),

    nn.Conv2d(16, 32, 3, padding='same'),
    nn.ReLU(),
    nn.MaxPool2d(2),

    nn.Conv2d(32, 64, 3, padding='same'),
    nn.ReLU(),
    nn.MaxPool2d(2),

    nn.Flatten(),
    nn.Linear(int((64*(image_size**2)) / (4**3)), 128),
    nn.ReLU(),
    nn.Linear(128, num_classes),
)

# Read number of classes and image size from data?

loss_fn = nn.CrossEntropyLoss()
optimizer = optim.SGD(BasicCNN.parameters(), lr=0.001, momentum=0.9)

In [154]:
for epoch in range(2):
    running_loss = 0.0
    for i, data in enumerate(train_dataloader, 0):
        inputs = data['image']
        labels = torch.tensor(data['label'])
        optimizer.zero_grad()
        outputs = BasicCNN(inputs)
        loss = loss_fn(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
        if i % 5 == 4:
            print(f'[{epoch+1},{i + 1:5d}] loss: {running_loss / 5:.3f}')
            running_loss = 0.0

print('Finished Training')

ValueError: too many dimensions 'str'

In [162]:
codes, uniques = pd.factorize(dataset.img_labels['Hubble_coarse'])
uniques

CategoricalIndex(['spiral', 'lenticular', 'elliptical', 'irregular'], categories=['elliptical', 'irregular', 'lenticular', 'spiral'], ordered=False, dtype='category')