In [None]:
from models.mlp import MLP
from tensor.tensor_scratch import TensorT
from tensor.tensor_scratch import TensorT
from core.ActivationFunctions import ActivationFunction
from core.LossFunctions import LossFunction


In [None]:

# --- Forward pass ---
# Logits are laid out as (classes, samples): 3 classes x 2 samples.
logits = TensorT([[2.0, 1.0],
                  [1.0, 3.0],
                  [0.1, 0.2]])

softmax_out = ActivationFunction.softmax(logits)

print("Softmax Output:")
for probs_row in softmax_out.data:
    print(probs_row)

# Every column holds one sample, so each column is expected to sum to 1.
column_sums = [sum(column) for column in zip(*softmax_out.data)]
print("Column sums:", column_sums)


# --- Backward pass ---
# Arbitrary, non-uniform upstream gradient with the same shape as the output
# (stands in for whatever a loss would send back).
grad_out = [[1.42195, -4.46124],
            [5.42451, 4.57461],
            [-8.43795, 5.52499]]

softmax_out.backward(grad=grad_out)

print("\nGradient wrt logits:")
for grad_row in logits.grad:
    print(grad_row)


In [None]:

# Cross-entropy + softmax gradient check.
# logits: 3 classes x 2 samples (same values as the forward/backward smoke test)
logits = TensorT([[2.0, 1.0],
                  [1.0, 3.0],
                  [0.1, 0.2]])
S = ActivationFunction.softmax(logits)  # (3,2)

# One-hot labels, one column per sample: sample 1 -> class 0, sample 2 -> class 1
Y = TensorT([[1, 0],
             [0, 1],
             [0, 0]])

m = S.shape[1]  # batch size (=2)

# Simulated upstream gradient of mean cross-entropy w.r.t. the probabilities:
#   dL/dS = -(Y / S) / m
# The max(..., 1e-15) clamp guards the division against a zero probability.
grad_wrt_probs = [
    [-(Y.data[i][j] / max(S.data[i][j], 1e-15)) / m for j in range(S.shape[1])]
    for i in range(S.shape[0])
]

# Backprop through softmax with the simulated upstream gradient
S.backward(grad=grad_wrt_probs)

# Analytic gradient at the logits for softmax + cross-entropy: (S - Y) / m
expected = [
    [(S.data[i][j] - Y.data[i][j]) / m for j in range(S.shape[1])]
    for i in range(S.shape[0])
]

print("Grad wrt logits (computed):")
for r in logits.grad:
    print(r)
print("\nExpected (S - Y)/m:")
for r in expected:
    print(r)

# Numeric check (small absolute tolerance).
# Written as `<= tol` inside all(): any comparison with NaN is False, so a NaN
# gradient is reported as a mismatch instead of slipping through a `> tol` test.
tol = 1e-8
ok = all(
    abs(logits.grad[i][j] - expected[i][j]) <= tol
    for i in range(len(expected))
    for j in range(len(expected[0]))
)
print("\nMatch within tolerance? ->", ok)


In [None]:

# Single-sample case: 3 classes stacked in one column.
logits = TensorT([[2.0], [1.0], [0.1]])
S = ActivationFunction.softmax(logits)

# Only class 0 receives an upstream gradient (as if the loss were pushing the
# true class up); the other two classes get zero.
grad_wrt_probs = [[-1.0], [0.0], [0.0]]  # shape (3,1)
S.backward(grad=grad_wrt_probs)

# Flatten the single column for display.
print("Softmax probs:", [entry[0] for entry in S.data])
print("Grad wrt logits:", [entry[0] for entry in logits.grad])
# Expected signs: class 0 negative, the other classes positive
# (probability mass moves from the other classes to the true class).


In [None]:
from tensor.tensor_scratch import TensorT
import math


In [None]:
def softmax_old(z: TensorT) -> TensorT:
    """Column-wise softmax with the legacy, diagonal-only backward pass.

    Kept as the baseline for comparing against a full vector-Jacobian product.

    Forward: each column of ``z.data`` has its maximum subtracted before
    exponentiation (numerical stability) and is then normalised by the
    column's sum of exponentials.

    Backward: the upstream gradient is combined elementwise with
    ``s * (1 - s)``, i.e. only the diagonal of the softmax Jacobian is used
    and the cross-class terms are dropped. This is knowingly incorrect when a
    column holds more than one class.

    Args:
        z: input tensor; ``z.data`` is read as a list of rows.

    Returns:
        A new ``TensorT`` holding the probabilities, created with
        ``_op='softmax_old'`` and ``_parent=(z,)`` and with ``backward_fn``
        attached.
    """
    # ----- forward (numerically stable) -----
    col_max = [max(column) for column in zip(*z.data)]
    shifted = [[value - col_max[c] for c, value in enumerate(row)] for row in z.data]

    # NOTE(review): _apply_unary presumably maps math.exp over the nested list;
    # confirm against TensorT.
    exps = z._apply_unary(shifted, math.exp)
    col_totals = [sum(column) for column in zip(*exps)]

    result_data = [[e / col_totals[c] for c, e in enumerate(row)] for row in exps]

    out = TensorT(result_data, _op='softmax_old', _parent=(z,))

    # ----- backward (diag-only; BUGGY for multiclass) -----
    def backward_fn(grad_op):
        # result_data is the (C, m) probability table computed above.
        grad_in = z._apply_elementwise(
            grad_op,
            result_data,
            lambda g, s_val: g * s_val * (1 - s_val),
        )
        return (grad_in,)

    out.backward_fn = backward_fn
    return out


In [None]:
def softmax_new(z: TensorT) -> TensorT:
    """Column-wise softmax with a full vector-Jacobian-product backward pass.

    Forward: each column of ``z.data`` has its maximum subtracted before
    exponentiation (numerical stability) and is then normalised by the
    column's sum of exponentials.

    Backward: for every column ``j`` the gradient w.r.t. the logits is
    ``s_ij * (g_ij - sum_k g_kj * s_kj)``, which includes the cross-class
    terms of the softmax Jacobian.

    Args:
        z: input tensor; ``z.data`` is read as a list of rows.

    Returns:
        A new ``TensorT`` holding the probabilities, created with
        ``_op='softmax_new'`` and ``_parent=(z,)`` and with ``backward_fn``
        attached.
    """
    # ----- forward (numerically stable) -----
    col_max = [max(column) for column in zip(*z.data)]
    shifted = [[value - col_max[c] for c, value in enumerate(row)] for row in z.data]

    # NOTE(review): _apply_unary presumably maps math.exp over the nested list;
    # confirm against TensorT.
    exps = z._apply_unary(shifted, math.exp)
    col_totals = [sum(column) for column in zip(*exps)]

    result_data = [[e / col_totals[c] for c, e in enumerate(row)] for row in exps]

    out = TensorT(result_data, _op='softmax_new', _parent=(z,))

    # ----- backward (CORRECT VJP) -----
    def backward_fn(grad_op):
        probs = result_data  # (C, m) nested lists
        n_classes = len(probs)
        n_samples = len(probs[0]) if n_classes else 0
        grad_in = [[0.0] * n_samples for _ in range(n_classes)]
        for col in range(n_samples):
            # g^T s for this sample: the term shared by every class in the column.
            weighted = sum(grad_op[row][col] * probs[row][col] for row in range(n_classes))
            for row in range(n_classes):
                grad_in[row][col] = probs[row][col] * (grad_op[row][col] - weighted)
        return (grad_in,)

    out.backward_fn = backward_fn
    return out


In [None]:
# Forward-pass comparison on the same logits (3 classes x 2 samples).
Z = TensorT([[2.0, 1.0],
             [1.0, 3.0],
             [0.1, 0.2]])

S_old = softmax_old(Z)
S_new = softmax_new(Z)

# Sum each column (one column per sample) of both outputs.
print("Old softmax column sums:", list(map(sum, zip(*S_old.data))))
print("New softmax column sums:", list(map(sum, zip(*S_new.data))))


In [None]:
# Dump the raw probability tables produced by both implementations.
for caption, softmax_result in (
    ("\nOld Softmax Output:", S_old),
    ("New Softmax Output:", S_new),
):
    print(caption, softmax_result.data)

In [None]:
# Uniform upstream gradient: a 3x2 table of ones.
# With the full softmax VJP a constant upstream gradient should produce
# (near-)zero logit gradients, because each column of probabilities sums to 1.
G = [[1.0] * 2 for _ in range(3)]

# --- old (diagonal-only backward) ---
Z1 = TensorT([[2.0, 1.0], [1.0, 3.0], [0.1, 0.2]])
S1 = softmax_old(Z1)
S1.backward(grad=G)
print("Old: grad wrt logits (ones upstream):")
for grad_row in Z1.grad:
    print(grad_row)

# --- new (full VJP backward) ---
Z2 = TensorT([[2.0, 1.0], [1.0, 3.0], [0.1, 0.2]])
S2 = softmax_new(Z2)
S2.backward(grad=G)
print("\nNew: grad wrt logits (ones upstream):")
for grad_row in Z2.grad:
    print(grad_row)


In [None]:
# Reference logits (3 classes x 2 samples); each implementation below works on
# its own copy so the two backward passes do not share a tensor.
Z = TensorT([[2.0, 1.0],
             [1.0, 3.0],
             [0.1, 0.2]])

# One-hot labels, one column per sample: sample1->class0, sample2->class1
Y = TensorT([[1, 0],
             [0, 1],
             [0, 0]])

# ======== OLD ========
Z_old = TensorT([logit_row[:] for logit_row in Z.data])
S_old = softmax_old(Z_old)

m = S_old.shape[1]
# Simulated upstream gradient from mean cross-entropy (elementwise):
#   dL/dS = -(Y / S) / m, with S clamped away from zero.
G_old = [
    [-(Y.data[i][j] / max(S_old.data[i][j], 1e-15)) / m
     for j in range(S_old.shape[1])]
    for i in range(S_old.shape[0])
]

S_old.backward(grad=G_old)

# Analytic logits gradient for softmax + cross-entropy: (S - Y)/m
expected_old = [
    [(S_old.data[i][j] - Y.data[i][j]) / m for j in range(S_old.shape[1])]
    for i in range(S_old.shape[0])
]

print("OLD softmax: grad wrt logits")
for grad_row in Z_old.grad:
    print(grad_row)
print("\nExpected (S - Y)/m (using OLD S):")
for expected_row in expected_old:
    print(expected_row)


# ======== NEW ========
Z_new = TensorT([logit_row[:] for logit_row in Z.data])
S_new = softmax_new(Z_new)

m = S_new.shape[1]
# Same simulated upstream gradient, built from the NEW probabilities.
G_new = [
    [-(Y.data[i][j] / max(S_new.data[i][j], 1e-15)) / m
     for j in range(S_new.shape[1])]
    for i in range(S_new.shape[0])
]

S_new.backward(grad=G_new)

expected_new = [
    [(S_new.data[i][j] - Y.data[i][j]) / m for j in range(S_new.shape[1])]
    for i in range(S_new.shape[0])
]

print("\nNEW softmax: grad wrt logits")
for grad_row in Z_new.grad:
    print(grad_row)
print("\nExpected (S - Y)/m (using NEW S):")
for expected_row in expected_new:
    print(expected_row)

# Absolute tolerance used by the match check that follows.
tol = 1e-8
def close(A, B, tol=1e-8):
    """Return True when two 2-D nested sequences agree elementwise within ``tol``.

    Args:
        A: nested sequence of numbers (rows of values).
        B: nested sequence of numbers to compare against ``A``.
        tol: maximum allowed absolute difference per element. Defaults to
            1e-8; pass it explicitly to use a different tolerance rather than
            relying on a notebook-level global.

    Returns:
        False if the number of rows differs, if any pair of rows differs in
        length, or if any pair of elements differs by more than ``tol``
        (a NaN difference counts as a mismatch). True otherwise, including
        for two empty inputs.
    """
    if len(A) != len(B):
        return False
    for row_a, row_b in zip(A, B):
        if len(row_a) != len(row_b):
            return False
        for a, b in zip(row_a, row_b):
            # `not (diff <= tol)` instead of `diff > tol`: every comparison with
            # NaN is False, so this form rejects NaN instead of accepting it.
            if not abs(a - b) <= tol:
                return False
    return True

# Report, for each implementation, whether the backpropagated logits gradient
# matches its expected (S - Y)/m table.
for tag, got, want in (
    ("\nMATCH OLD?  ", Z_old.grad, expected_old),
    ("MATCH NEW?  ", Z_new.grad, expected_new),
):
    print(tag, close(got, want))


In [None]:
# Single sample (3x1): only class 0 receives an upstream gradient.

# Legacy implementation
Z_a = TensorT([[2.0], [1.0], [0.1]])
S_a_old = softmax_old(Z_a)
S_a_old.backward(grad=[[-1.0], [0.0], [0.0]])
print("OLD grad wrt logits (3x1):", [entry[0] for entry in Z_a.grad])

# Full-VJP implementation
Z_b = TensorT([[2.0], [1.0], [0.1]])
S_b_new = softmax_new(Z_b)
S_b_new.backward(grad=[[-1.0], [0.0], [0.0]])
print("NEW grad wrt logits (3x1):", [entry[0] for entry in Z_b.grad])


In [24]:
# Operator-precedence sanity check: unary minus binds tighter than * and /,
# and * and / share one precedence level and group left to right.
a = 10
b = c = 3
d = 5

lhs = -a * b / c / d        # groups as ((-a * b) / c) / d
rhs = -a * b / c * d        # groups as ((-a * b) / c) * d, so d multiplies
mhs = (-a * b) / (c * d)    # explicit "divide by the product" form

(lhs, rhs, mhs)

(-2.0, -50.0, -2.0)