### Contents
Loading the Images<br>
Understanding the Samples<br>
Exploring the Training Set<br>
From Images to Numpy Arrays<br>
From Numpy Arrays to Tensors<br>


In [17]:
import sys

import matplotlib
import numpy as np
import pandas as pd
from PIL.Image import Image
import torch
from torch import tensor
from torchvision import transforms
from torchvision.datasets import MNIST

In [18]:
# Use the 'Greys' colormap for images by default.
matplotlib.rcParams['image.cmap'] = 'Greys'

# Pandas display: floats with thousands separators and two decimals, and
# high column/row limits so frames are printed in full.
pd.set_option('display.float_format', '{:,.2f}'.format)
pd.options.display.max_columns = 999
pd.options.display.max_rows = 1000

# Allow 200 characters per printed line for tensors and NumPy arrays.
torch.set_printoptions(linewidth=200)
np.set_printoptions(linewidth=200)


### Loading the Images

In [37]:
# Transform pipeline: convert an image to a tensor, then normalize it with
# mean 0.5 and standard deviation 0.5.
# Note: the pipeline is only defined here. The datasets below are created
# without it, so indexing them returns the images untransformed.
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5,), (0.5,)),
])

# Download (if needed) and open the training and test splits.
train = MNIST(root='./mnist_data/', train=True, download=True)
test = MNIST(root='./mnist_data/', train=False, download=True)

print(type(train))
print('Number of samples in the training set', len(train))
print('Number of samples in the test set', len(test))

<class 'torchvision.datasets.mnist.MNIST'>
Number of samples in the training set 60000
Number of samples in the test set 10000


### Understanding the Samples

In [None]:

# Take the first training sample and report how it is put together:
# the container type and length, then the type of each of its two items.
sample = train[0]

for caption, detail in (
    ('Sample type:', type(sample)),
    ('Sample length:', len(sample)),
    ('Image type:', type(sample[0])),
    ('Label type:', type(sample[1])),
):
    print(caption, detail)

In [None]:
# Print the label (second item of the sample), then leave the image (first item)
# as the cell's last expression so the notebook displays it.
print('The image below is a:', sample[1])
sample[0]

In [None]:
# sys.getsizeof() reports only the size of the Python object it is handed; it does
# not follow references to other objects. For a dataset that means the image data
# it holds is not counted, so these two numbers say nothing about how big the
# sets really are.
print(sys.getsizeof(train))
print(sys.getsizeof(test))

# The images are stored in each dataset's `data` tensor, so measure that buffer:
# number of elements times bytes per element.
print('Training images (bytes):', train.data.nelement() * train.data.element_size())
print('Test images (bytes):', test.data.nelement() * test.data.element_size())

### Exploring the Training Set

In [None]:
def filter_samples_by_label(samples: MNIST, label: int) -> list:
    '''
    Returns, in their original order, the samples whose label equals `label`.

    samples is typically the training set or the test set; the label of each
    sample is read from its second item. label is expected to be an integer
    between 0 and 9.
    '''
    return [sample for sample in samples if sample[1] == label]

In [None]:
# Collect every training sample labelled 5 and report how many there are.
fives = filter_samples_by_label(samples=train, label=5)

print('Type:', type(fives))
print('Number of fives:', len(fives))

In [None]:
def create_banner_image(samples: list) -> Image:
    '''
    Creates a banner image by placing MNIST sample images side by side, left to right.

    samples is a list of MNIST samples; only the first item of each sample (the
    image) is used.

    Returns None for an empty list and the sample's own image, unchanged, for a
    single sample. For two or more samples a new 'RGBA' image is returned that is
    as wide as all images combined and as tall as the tallest one.
    '''
    images = [sample[0] for sample in samples]

    # Nothing to compose for zero or one image.
    if not images:
        return None
    if len(images) == 1:
        return images[0]

    # `Image` at notebook level is the PIL image *class* (used for the annotation).
    # The new() factory is a function of the PIL.Image *module*, so import the
    # module under its own name instead of calling new() on the class.
    from PIL import Image as pil_image_module

    # Size the banner once instead of rebuilding it for every added image.
    banner_width = sum(image.size[0] for image in images)
    banner_height = max(image.size[1] for image in images)
    banner = pil_image_module.new('RGBA', (banner_width, banner_height))

    # Paste each image directly to the right of the previous one.
    x_offset = 0
    for image in images:
        banner.paste(image, (x_offset, 0))
        x_offset += image.size[0]

    return banner

In [None]:
# Build a banner from the first ten fives and display it as the cell output.
banner = create_banner_image(samples=fives[:10])
banner

### From Images to Numpy Arrays

In [31]:
# Take the first training sample; its first item is the image.
sample = train[0]
image = sample[0]

# Convert the image into a NumPy array of pixel values.
image_array = np.array(image)

print('Image type:', type(image))
print('Array type:', type(image_array))

# Show only the top-left corner: the first 6 rows and first 20 columns.
print(image_array[:6, :20])

Image type: <class 'PIL.Image.Image'>
Array type: <class 'numpy.ndarray'>
[[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0   3  18  18  18 126 136 175  26]]


In [52]:
# Build plain Python lists holding one NumPy array per image
# (the label in each sample is ignored).
train_list = [np.array(image) for image, _ in train]
test_list = [np.array(image) for image, _ in test]

print('Type of training list:', type(train_list))
print('Length of training list:', len(train_list))
print('Length of test list:', len(test_list))

Type of training list: <class 'list'>
Length of training list: 60000
Length of test list: 10000


In [49]:
# Slow path: hand the Python lists straight to torch.tensor().
# PyTorch emits a warning when it is given a large Python list like this.
train_tensor = torch.tensor(data=train_list)
test_tensor = torch.tensor(data=test_list)

print('Type of training tensor:', type(train_tensor))
print('ndim (or Rank):', train_tensor.dim())
print('Shape:', train_tensor.size())

Type of training tensor: <class 'torch.Tensor'>
ndim (or Rank): 3
Shape: torch.Size([60000, 28, 28])


In [46]:
# Converting to a Numpy array first is faster, but it creates a new copy in memory.
train_tensors = torch.tensor(np.array(train_list))
test_tensors = torch.tensor(np.array(test_list))

# Report on the tensor created in this cell (train_tensors), not on a
# similarly named variable left over from another cell.
print('Type of training tensor:', type(train_tensors))
print('ndim (or Rank):', train_tensors.ndim)
print('Shape:', train_tensors.shape)

In [53]:
# Rebuild both lists so that each entry is a tensor (one per image) rather than
# a NumPy array; the label in each sample is ignored.
train_list = [torch.tensor(np.array(image)) for image, _ in train]
test_list = [torch.tensor(np.array(image)) for image, _ in test]

In [54]:
# Using torch.stack(): join the per-image tensors along a new first dimension.
train_tensors = torch.stack(train_list)
test_tensors = torch.stack(test_list)

# Report on the tensor created in this cell (train_tensors), not on a
# similarly named variable left over from another cell.
print('Type of training tensor:', type(train_tensors))
print('ndim (or Rank):', train_tensors.ndim)
print('Shape:', train_tensors.shape)

Type of training tensor: <class 'torch.Tensor'>
ndim (or Rank): 3
Shape: torch.Size([60000, 28, 28])


In [50]:
# Note: despite its name, data_list is a NumPy array, not a Python list.
data_list = np.array([0, 1, 2, 3, 4, 5])

# np.array() copies its input by default, so data_copy has its own memory.
data_copy = np.array(data_list)

# np.asarray() does not copy an input that is already an ndarray, so
# data_pointer shares memory with data_list.
data_pointer = np.asarray(data_list)

print(data_copy)
print(data_pointer)

# Modify the original: only the variable that shares memory sees the change.
data_list[0] = 9
print(data_copy)
print(data_pointer)


[  0   1 100  42  13   7]
[  0   1 100  42  13   7]
[  0   1 100  42  13   7]
[  9   1 100  42  13   7]


In [33]:
# Show the first training image as a table of pixel values. The DataFrame's style
# object shrinks the font and shades each cell with the 'Greys' colormap.
# The image gets its own name here so the name train_tensor is not reused for a
# single image.
first_image_tensor = train_tensors[0]

# Hand pandas a plain NumPy array rather than the tensor itself, so the result
# does not depend on how the installed pandas version treats tensor objects.
image_df = pd.DataFrame(first_image_tensor.numpy())
image_df.style.set_properties(**{'font-size':'6pt'}).background_gradient('Greys')


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,0,0,3,18,18,18,126,136,175,26,166,255,247,127,0,0,0,0
6,0,0,0,0,0,0,0,0,30,36,94,154,170,253,253,253,253,253,225,172,253,242,195,64,0,0,0,0
7,0,0,0,0,0,0,0,49,238,253,253,253,253,253,253,253,253,251,93,82,82,56,39,0,0,0,0,0
8,0,0,0,0,0,0,0,18,219,253,253,253,253,253,198,182,247,241,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,80,156,107,253,253,205,11,0,43,154,0,0,0,0,0,0,0,0,0,0


In [None]:
# Display the DataFrame through its style object with no extra formatting applied.
image_df.style

In [None]:
# Build the loaders from datasets that apply `transform`.
# `train` and `test` were opened without a transform and return PIL images, which
# the DataLoader's default collate function cannot combine into a batch. Re-opening
# both splits with `transform` makes every sample a normalized tensor.
# shuffle=False keeps the samples in dataset order.
train_loader = torch.utils.data.DataLoader(
    MNIST('./mnist_data/', download=True, train=True, transform=transform),
    batch_size=64,
    shuffle=False,
)
test_loader = torch.utils.data.DataLoader(
    MNIST('./mnist_data/', download=True, train=False, transform=transform),
    batch_size=64,
    shuffle=False,
)