In [1]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential
from sklearn.datasets import make_blobs




The softmax function can be written:
$$a_j = \frac{e^{z_j}}{ \sum_{k=1}^{N}{e^{z_k} }} \tag{1}$$
The output $\mathbf{a}$ is a vector of length N, so for softmax regression, you could also write:
\begin{align}
\mathbf{a}(x)
\begin{bmatrix}
P(y = 1 | \mathbf{x}; \mathbf{w},b) \\
\vdots \\
P(y = N | \mathbf{x}; \mathbf{w},b)
\end{bmatrix}
\frac{1}{ \sum_{k=1}^{N}{e^{z_k} }}
\begin{bmatrix}
e^{z_1} \\
\vdots \\
e^{z_{N}} \\
\end{bmatrix} \tag{2}
\end{align}

In [2]:
# Creating my Softmax Function
def softmax(z):
    ez = np.exp(z)
    sm = ez / np.sum(ez)
    return sm

The loss function associated with Softmax, the cross-entropy loss, is:
\begin{equation}
  L(\mathbf{a},y)=\begin{cases}
    -log(a_1), & \text{if $y=1$}.\\
        &\vdots\\
     -log(a_N), & \text{if $y=N$}
  \end{cases} \tag{3}
\end{equation}

Where y is the target category for this example and $\mathbf{a}$ is the output of a softmax function. In particular, the values in $\mathbf{a}$ are probabilities that sum to one.
>**Recall:** In this course, Loss is for one example while Cost covers all examples. 
 
 
Note in (3) above, only the line that corresponds to the target contributes to the loss, other lines are zero. To write the cost equation we need an 'indicator function' that will be 1 when the index matches the target and zero otherwise. 
    $$\mathbf{1}\{y == n\} = =\begin{cases}
    1, & \text{if $y==n$}.\\
    0, & \text{otherwise}.
  \end{cases}$$
Now the cost is:
\begin{align}
J(\mathbf{w},b) = -\frac{1}{m} \left[ \sum_{i=1}^{m} \sum_{j=1}^{N}  1\left\{y^{(i)} == j\right\} \log \frac{e^{z^{(i)}_j}}{\sum_{k=1}^N e^{z^{(i)}_k} }\right] \tag{4}
\end{align}

Where $m$ is the number of examples, $N$ is the number of outputs. This is the average of all the losses.


In [3]:
# make  dataset for example
centers = [[-5, 2], [-2, -2], [1, 2], [5, -2]]
X_train, y_train = make_blobs(n_samples=2000, centers=centers, cluster_std=1.0,random_state=30)

In [6]:
# Generating Model
model = Sequential(
    [
        Dense(25, activation='relu', name='layer1'),
        Dense(15, activation='relu', name='layer2'),
        Dense(4 , activation='sigmoid', name='layer3')
    ]
)




In [7]:
model.compile(
    loss=tf.keras.losses.SparseCategoricalCrossentropy(),
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001)
)

In [8]:
model.fit(
    X_train, y_train,
    epochs=10
)

Epoch 1/10

Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x19bc6c16a90>

In [9]:
p_nonpreferred = model.predict(X_train)
print(p_nonpreferred [:2])
print("largest value", np.max(p_nonpreferred), "smallest value", np.min(p_nonpreferred))

[[0.34081835 0.05275655 0.9472803  0.30914465]
 [0.99606645 0.6663822  0.06256275 0.011466  ]]
largest value 0.999976 smallest value 3.8892226e-06


In [10]:
for i in range(5):
    print( f"{p_nonpreferred[i]}, category: {np.argmax(p_nonpreferred[i])}")

[0.34081835 0.05275655 0.9472803  0.30914465], category: 2
[0.99606645 0.6663822  0.06256275 0.011466  ], category: 0
[0.98589414 0.71075076 0.10981676 0.02571319], category: 0
[0.54264534 0.9883067  0.04006017 0.1688556 ], category: 1
[0.68765503 0.00207645 0.99371547 0.04023519], category: 2


## Optimized Solution

In [11]:
preferred_model = Sequential(
    [ 
        Dense(25, activation = 'relu'),
        Dense(15, activation = 'relu'),
        Dense(4, activation = 'linear')   #<-- Note
    ]
)

In [12]:
preferred_model.compile(
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001)
)

In [13]:
preferred_model.fit(
    X_train, y_train,
    epochs=10
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x19bc8161950>

In [14]:
p_preferred = preferred_model.predict(X_train)
print(f"two example output vectors:\n {p_preferred[:2]}")
print("largest value", np.max(p_preferred), "smallest value", np.min(p_preferred))

two example output vectors:
 [[-1.0803182  -2.685932    3.2586536  -0.05245831]
 [ 5.513552    0.45815724 -4.297861   -3.355855  ]]
largest value 10.480713 smallest value -10.253984


The output predictions are not probabilities!
If the desired output are probabilities, the output should be be processed by a softmax

In [15]:
sm_preferred = tf.nn.softmax(p_preferred).numpy()
print(f"two example output vectors:\n {sm_preferred[:2]}")
print("largest value", np.max(sm_preferred), "smallest value", np.min(sm_preferred))

two example output vectors:
 [[1.2403165e-02 2.4901382e-03 9.5043892e-01 3.4667820e-02]
 [9.9347258e-01 6.3332357e-03 5.4464486e-05 1.3970793e-04]]
largest value 0.9999896 smallest value 3.464973e-08


To select the most likely category, the softmax is not required. One can find the index of the largest output using np.argmax()

In [16]:
for i in range(5):
    print( f"{p_preferred[i]}, category: {np.argmax(p_preferred[i])}")

[-1.0803182  -2.685932    3.2586536  -0.05245831], category: 2
[ 5.513552    0.45815724 -4.297861   -3.355855  ], category: 0
[ 4.1857724  0.6524269 -3.239903  -2.8281155], category: 0
[-2.004035   3.3781226 -1.2430665 -2.4520552], category: 1
[ 1.4730426 -3.3420036  6.623793  -1.6249774], category: 2
