In [1]:
# Implementation of Softmax Regression from Scratch
import tensorflow as tf
from IPython import display
from d2l import tensorflow as d2l

# Load the Fashion-MNIST dataset as minibatch iterators for training and testing;
# `batch_size` is the number of examples requested per minibatch.
batch_size = 256
train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size)

2022-02-04 12:39:29.699382: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Initializing Model Parameters
# Each image is flattened and treated as a vector of length 784. More sophisticated
# strategies that exploit the spatial structure of images come later (presumably CNNs).
num_inputs = 784
num_outputs = 10

# The weights form a (784, 10) matrix and the biases a length-10 vector.
# As with linear regression, the weights start as small Gaussian noise (std 0.01)
# and the biases start at 0.
W = tf.Variable(
    tf.random.normal(shape=(num_inputs, num_outputs), mean=0, stddev=0.01)
)
b = tf.Variable(tf.zeros(num_outputs))

In [3]:
# Defining the Softmax Operation
# `keepdims=True` keeps the reduced axis in the result (with length 1), so the output
# has the same number of dimensions as the input: summing the (2, 3) tensor below gives
# shape (1, 3) along axis 0 and shape (2, 1) along axis 1.
X = tf.constant([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]])
tf.reduce_sum(X, 0, keepdims=True), tf.reduce_sum(X, 1, keepdims=True)

(<tf.Tensor: shape=(1, 3), dtype=float32, numpy=array([[5., 7., 9.]], dtype=float32)>,
 <tf.Tensor: shape=(2, 1), dtype=float32, numpy=
 array([[ 6.],
        [15.]], dtype=float32)>)

In [4]:
# The operation below assumes a 2-dimensional input and normalizes along axis 1 (one row per example).
def softmax(X):
    """Apply the softmax function along axis 1 so that every row sums to 1.

    Args:
        X: 2-D tensor of scores; normalization is performed over axis 1.

    Returns:
        A tensor with the same shape as `X` whose entries are non-negative
        and whose rows each sum to 1.
    """
    # Shift every row by its maximum before exponentiating. Softmax is invariant
    # to a per-row shift, but without it tf.exp overflows to inf for large scores
    # and the division below produces NaN. The shift is treated as a constant
    # (stop_gradient) because it does not change the value of the function.
    row_max = tf.stop_gradient(tf.reduce_max(X, 1, keepdims=True))
    X_exp = tf.exp(X - row_max)
    # Normalizing constant of each row; keepdims=True keeps shape (rows, 1) so
    # that the division broadcasts across the columns.
    partition = tf.reduce_sum(X_exp, 1, keepdims=True)
    return X_exp / partition

In [5]:
# Sanity check: treat a random (2, 5) matrix as the outputs for 2 observations and
# 5 classes (the matrix O in the book's notation). Every row of the result should sum to 1.
X = tf.random.normal((2, 5), 0, 1)
X_prob = softmax(X)
X_prob, tf.reduce_sum(X_prob, 1)

(<tf.Tensor: shape=(2, 5), dtype=float32, numpy=
 array([[0.40271994, 0.04838905, 0.27165487, 0.13820343, 0.1390327 ],
        [0.27239957, 0.05196008, 0.4103868 , 0.14165089, 0.12360267]],
       dtype=float32)>,
 <tf.Tensor: shape=(2,), dtype=float32, numpy=array([1., 1.], dtype=float32)>)

In [6]:
# Defining the Model
def net(X):
    """Softmax regression model: flatten the inputs, apply the affine map, then softmax.

    Uses the module-level parameters `W` and `b`.
    """
    # Flatten each example into a row vector whose length matches the rows of W.
    flattened = tf.reshape(X, (-1, W.shape[0]))
    logits = tf.matmul(flattened, W) + b
    return softmax(logits)

In [7]:
# Defining the Loss Function
# This cell illustrates `tf.boolean_mask`: the one-hot encoding of `y` acts as a mask
# that picks, from each row of `y_hat`, the predicted probability of the true class.
y_hat = tf.constant([[0.1, 0.3, 0.6], [0.3, 0.2, 0.5]])
y = tf.constant([0, 2])
tf.boolean_mask(y_hat, tf.one_hot(y, depth=y_hat.shape[-1])) # y_hat has shape (2, 3), so depth is 3

<tf.Tensor: shape=(2,), dtype=float32, numpy=array([0.1, 0.5], dtype=float32)>

In [8]:
def cross_entropy(y_hat, y):
    """Return the per-example cross-entropy loss.

    Args:
        y_hat: predicted class probabilities, one row per example.
        y: integer class labels, one per example.

    Returns:
        A 1-D tensor holding -log(probability assigned to the true class)
        for every example.
    """
    num_classes = y_hat.shape[-1]
    # One-hot labels serve as a mask selecting the true-class probability per row.
    true_class_mask = tf.one_hot(y, depth=num_classes)
    true_class_prob = tf.boolean_mask(y_hat, true_class_mask)
    return -tf.math.log(true_class_prob)

# Per-example losses for the toy `y_hat` / `y` defined earlier:
# -log(0.1) is about 2.303 and -log(0.5) is about 0.693.
cross_entropy(y_hat, y)

<tf.Tensor: shape=(2,), dtype=float32, numpy=array([2.3025851, 0.6931472], dtype=float32)>

In [9]:
# Classification Accuracy
# The classification accuracy is the fraction of all predictions that are correct.
# Although it can be difficult to optimize accuracy directly, 
# it is often the performance measure that we care most about.
def accuracy(y_hat, y): #@save
    """Compute the number of correct predictions."""
    # A tensor of rank > 1 with more than one column holds per-class scores;
    # reduce it to the index of the largest score in each row.
    has_class_scores = len(y_hat.shape) > 1 and y_hat.shape[1] > 1
    if has_class_scores:
        y_hat = tf.argmax(y_hat, axis=1)
    # Cast the predictions to the label dtype so the element-wise comparison is valid.
    matches = tf.cast(y_hat, y.dtype) == y
    num_correct = tf.reduce_sum(tf.cast(matches, y.dtype))
    return float(num_correct)

In [10]:
# For the toy data, the predicted classes are [2, 2] against labels [0, 2],
# so 1 of the 2 predictions is correct.
accuracy(y_hat, y) / len(y)

0.5

In [11]:
class Accumulator: #@save
    """For accumulating sums over 'n' variables."""

    def __init__(self, n):
        # One running total per tracked variable, all starting at zero.
        self.data = [0.0 for _ in range(n)]

    def add(self, *args):
        """Add each positional argument to the running total in the same position."""
        updated = []
        for total, increment in zip(self.data, args):
            updated.append(total + float(increment))
        self.data = updated

    def reset(self):
        """Set every running total back to zero, keeping the number of totals."""
        self.data = [0.0 for _ in self.data]

    def __getitem__(self, idx):
        """Return the running total stored at position `idx`."""
        return self.data[idx]

In [26]:
def evaluate_accuracy(net, data_iter): #@save
    """Compute the accuracy for a model on a dataset."""
    # totals[0]: number of correct predictions, totals[1]: number of predictions
    totals = Accumulator(2)
    for features, labels in data_iter:
        num_correct = accuracy(net(features), labels)
        totals.add(num_correct, d2l.size(labels))
    correct, seen = totals[0], totals[1]
    return correct / seen

In [28]:
# With freshly initialized parameters the model should perform near chance level
# (about 1/10 for the 10 output classes).
evaluate_accuracy(net, test_iter)

0.0976

In [31]:
# Training
# First, we define a function to train for one epoch.
# This function probably assumes that the parameters have already been initialized (my guess before reading the code).
# Pipeline: dataset -> minibatches ('train_iter') -> parameter initialization -> model construction ('net') -> loss function selection ('loss') -> optimization method ('updater')
def train_epoch_ch3(net, train_iter, loss, updater): #@save
    """The training loop defined in Chapter 3.

    Runs one pass over `train_iter`, updating the parameters after every
    minibatch, and returns (average training loss, training accuracy).
    """
    # Keras losses take (labels, predictions) and return the batch average by
    # default; the hand-written losses in this notebook, e.g. 'cross_entropy',
    # take (predictions, labels) and return one loss value per example.
    uses_keras_loss = isinstance(loss, tf.keras.losses.Loss)
    uses_keras_optimizer = isinstance(updater, tf.keras.optimizers.Optimizer)

    # Sum of training loss, sum of correct predictions, no. of examples
    totals = Accumulator(3)
    for X, y in train_iter:
        # Forward pass under the tape so gradients can be taken afterwards
        with tf.GradientTape() as tape:
            y_hat = net(X)
            if uses_keras_loss:
                batch_loss = loss(y, y_hat)
            else:
                batch_loss = loss(y_hat, y)

        # Compute gradients and update parameters
        if uses_keras_optimizer:
            params = net.trainable_variables
            grads = tape.gradient(batch_loss, params)
            updater.apply_gradients(zip(grads, params))
        else:
            # Custom updater: called with the batch size and the gradients of
            # the parameters it exposes as `updater.params`.
            updater(X.shape[0], tape.gradient(batch_loss, updater.params))

        # Convert the Keras batch average back into a sum so that both kinds
        # of loss accumulate on the same scale.
        if uses_keras_loss:
            loss_sum = batch_loss * float(tf.size(y))
        else:
            loss_sum = tf.reduce_sum(batch_loss)
        totals.add(loss_sum, accuracy(y_hat, y), tf.size(y))

    # Per-example averages of the loss and of the correct-prediction count
    num_examples = totals[2]
    return totals[0] / num_examples, totals[1] / num_examples

         

In [48]:
# Before showing the implementation of the training function, we define a utility class that plots data in animation.
class Animator: #@save
    """For plotting data in animation.

    Keeps one list of x values and one list of y values per curve; every call
    to `add` appends new points and redraws the whole figure in the notebook.
    """
    def __init__(self, xlabel=None, ylabel=None, legend=None, xlim=None, ylim=None,
                 xscale='linear', yscale='linear',
                 fmts=('-', 'm--', 'g-.', 'r:'), nrows=1, ncols=1, figsize=(3.5, 2.5)):
        """Create the figure and remember how its axes should be configured.

        Args:
            xlabel, ylabel: axis labels forwarded to `d2l.set_axes`.
            legend: legend entries; defaults to an empty list.
            xlim, ylim: axis limits forwarded to `d2l.set_axes`.
            xscale, yscale: axis scales forwarded to `d2l.set_axes`.
            fmts: matplotlib format strings, one per curve.
            nrows, ncols: subplot grid passed to `d2l.plt.subplots`.
            figsize: figure size passed to `d2l.plt.subplots`.
        """
        # Incrementally plot multiple lines
        if legend is None:
            legend = []
        d2l.use_svg_display()
        self.fig, self.axes = d2l.plt.subplots(nrows, ncols, figsize=figsize)
        if nrows * ncols == 1:
            # A single subplot comes back as a bare Axes object; wrap it so that
            # `self.axes[0]` works below.
            self.axes = [self.axes,]
        # Use a lambda function to capture arguments.
        # Fixed: the helper is `d2l.set_axes`; the previous `d2l.set.axes`
        # raised AttributeError as soon as `add` invoked this callback.
        self.config_axes = lambda: d2l.set_axes(
            self.axes[0], xlabel, ylabel, xlim, ylim, xscale, yscale, legend
        )
        self.X, self.Y, self.fmts = None, None, fmts

    def add(self, x, y):
        """Add data points to the curves and redraw the figure.

        Args:
            x: a scalar shared by all curves, or a sequence with one x value per curve.
            y: a scalar (single curve) or a sequence with one y value per curve.
                Pairs in which either value is None are skipped.
        """
        # Add multiple data points into the figure
        if not hasattr(y, "__len__"):
            y = [y]
        n = len(y)
        if not hasattr(x, "__len__"):
            x = [x] * n
        # Lazily create one x-list and one y-list per curve on first use.
        if not self.X:
            self.X = [[] for _ in range(n)]
        if not self.Y:
            self.Y = [[] for _ in range(n)]
        for i, (a, b) in enumerate(zip(x, y)):
            if a is not None and b is not None:
                self.X[i].append(a)
                self.Y[i].append(b)
        # Clear the axes and replot every curve from the accumulated points.
        self.axes[0].cla()
        for x, y, fmt in zip(self.X, self.Y, self.fmts):
            self.axes[0].plot(x, y, fmt)
        self.config_axes()
        # Show the updated figure; clear_output(wait=True) defers clearing until
        # new output is available to replace it.
        display.display(self.fig)
        display.clear_output(wait=True)
        

In [60]:
#fig, axs = d2l.plt.subplots(2,2)
#axs[0].shape