In [1]:
using Pkg
Pkg.status()

[32m[1mStatus[22m[39m `~/git/Julia_ML_training/unit5/Project.toml`
[32m⌃[39m [90m[cbdf2221] [39mAlgebraOfGraphics v0.10.7
  [90m[336ed68f] [39mCSV v0.10.15
[32m⌃[39m [90m[13f3f980] [39mCairoMakie v0.13.9
  [90m[a93c6f00] [39mDataFrames v1.7.0
  [90m[31c24e10] [39mDistributions v0.25.120
[32m⌃[39m [90m[587475ba] [39mFlux v0.16.3
  [90m[38e38edf] [39mGLM v1.9.0
  [90m[60bf3e95] [39mGLPK v1.2.1
  [90m[09f84164] [39mHypothesisTests v0.11.5
  [90m[4076af6c] [39mJuMP v1.26.0
  [90m[23fbe1c1] [39mLatexify v0.16.8
[32m⌃[39m [90m[b2108857] [39mLux v1.13.0
  [90m[eb30cadb] [39mMLDatasets v0.7.18
  [90m[add582a8] [39mMLJ v0.20.8
[33m⌅[39m [90m[ee78f7c6] [39mMakie v0.22.9
  [90m[ff71e718] [39mMixedModels v4.35.2
  [90m[6f286f6a] [39mMultivariateStats v0.10.3
  [90m[636a865e] [39mNearestNeighborModels v0.2.3
[32m⌃[39m [90m[429524aa] [39mOptim v1.12.0
  [90m[92933f4c] [39mProgressMeter v1.10.4
  [90m[ce6b1742] [39mRDatasets v0.7.7
  [90m[291

In [2]:
using Flux
using Flux: onehotbatch, onecold, logitcrossentropy, DataLoader
using MLDatasets
using Statistics
using Random
using ProgressMeter

In [3]:
# Seed the global RNG so that later random operations in this session are repeatable.
Random.seed!(0)

# Training configuration
learning_rate = 1e-3  # step size handed to the optimiser
batch_size = 128      # samples per mini-batch in the DataLoader
num_epochs = 10;      # number of passes of the training loop

In [4]:
# Load the MNIST training and test splits. Indexing a dataset with `[:]` returns
# all samples, which are destructured here into images and labels.
train_x_raw, train_y_raw = MNIST(; split=:train)[:]
test_x_raw, test_y_raw = MNIST(; split=:test)[:]
# Show the raw array shapes of the training images and labels.
size.((train_x_raw, train_y_raw))

((28, 28, 60000), (60000,))

In [5]:
"""
    preprocess_features(x)

Flatten a stack of 28×28 images into the columns of a `Float32` matrix.

The dense network consumes one feature vector per sample rather than a 2D image,
so `x` is reshaped to `(784, num_samples)` (e.g. `(784, 60000)`), and the values
are converted to `Float32` for performance. A new array is returned.
"""
function preprocess_features(x)
    n_pixels = 28 * 28
    flattened = reshape(x, n_pixels, :)  # one column per image
    return Float32.(flattened)
end

preprocess_features (generic function with 1 method)

In [6]:
# Flatten both splits with the same preprocessing function.
train_x, test_x = map(preprocess_features, (train_x_raw, test_x_raw))
# Show the resulting (features, num_samples) shapes.
size.((train_x, test_x))

((784, 60000), (784, 10000))

In [7]:
# The loss compares model outputs against "one-hot" encoded labels, where each
# label becomes a column with a single `true` entry.
# E.g., the label '2' becomes: [0, 0, 1, 0, 0, 0, 0, 0, 0, 0]
# The range 0:9 lists every possible class (the digits).
train_y, test_y = map(labels -> onehotbatch(labels, 0:9), (train_y_raw, test_y_raw))
# Show the resulting (num_classes, num_samples) shapes.
size.((train_y, test_y))

((10, 60000), (10, 10000))

In [8]:
# Example: the first raw training label next to its one-hot encoded column
(first(train_y_raw), train_y[:, 1])

(5, Bool[0, 0, 0, 0, 0, 1, 0, 0, 0, 0])

In [9]:
# Wrap the training features and one-hot labels in a DataLoader, which yields
# shuffled mini-batches of `batch_size` samples on each pass over the data.
train_loader = DataLoader((train_x, train_y); batchsize=batch_size, shuffle=true)

469-element DataLoader(::Tuple{Matrix{Float32}, OneHotArrays.OneHotMatrix{UInt32, Vector{UInt32}}}, shuffle=true, batchsize=128)
  with first element:
  (784×128 Matrix{Float32}, 10×128 OneHotMatrix(::Vector{UInt32}) with eltype Bool,)

In [10]:
# Fully connected classifier: two ReLU hidden layers followed by a linear output layer.
model = Chain(
    Dense(28 * 28 => 128, relu),  # flattened 784-pixel image -> 128 hidden units
    Dense(128 => 64, relu),       # 128 -> 64 hidden units
    Dense(64 => 10),              # 64 -> one raw score (logit) per digit 0-9
)
# Note: the last layer has no activation. `logitcrossentropy` expects raw logits
# (it handles the softmax itself for numerical stability), whereas plain
# `crossentropy` expects probabilities.

Chain(
  Dense(784 => 128, relu),              [90m# 100_480 parameters[39m
  Dense(128 => 64, relu),               [90m# 8_256 parameters[39m
  Dense(64 => 10),                      [90m# 650 parameters[39m
) [90m                  # Total: 6 arrays, [39m109_386 parameters, 427.594 KiB.

In [11]:
# Multi-class classification loss. `logitcrossentropy` is applied directly to the
# raw model outputs (logits), so the model needs no softmax layer.
#   m: the model, x: a batch of inputs, y: the matching one-hot labels
function loss(m, x, y)
    logits = m(x)
    return logitcrossentropy(logits, y)
end

loss (generic function with 1 method)

In [12]:
# Build the optimiser state for the trainable parameters of `model`.
# `Adam` is the current name of the optimisation rule; the all-caps `ADAM`
# spelling is a legacy alias kept only for backward compatibility.
opt_state = Flux.setup(Adam(learning_rate), model);

In [13]:
println("\nStarting training...")

# One iteration of this loop is one epoch.
for epoch in 1:num_epochs
    # A single `Flux.train!` call walks through every batch in `train_loader`:
    # it evaluates the loss, computes gradients and updates the model parameters.
    # `@time` reports how long the epoch took.
    @time Flux.train!(loss, model, train_loader, opt_state)

    # Monitor progress on the test set: turn the logits into digit labels (0-9)
    # and measure the fraction that match the true labels.
    epoch_predictions = onecold(model(test_x), 0:9)
    epoch_accuracy = mean(epoch_predictions .== test_y_raw)

    println("Epoch $epoch: Test Accuracy = ", round(epoch_accuracy * 100, digits=2), "%")
end

println("\nTraining complete!")


Starting training...
  6.735494 seconds (34.47 M allocations: 2.218 GiB, 3.46% gc time, 93.61% compilation time)
Epoch 1: Test Accuracy = 95.41%
  0.459711 seconds (152.44 k allocations: 558.265 MiB, 6.93% gc time)
Epoch 2: Test Accuracy = 96.35%
  0.412442 seconds (152.44 k allocations: 558.265 MiB, 5.55% gc time)
Epoch 3: Test Accuracy = 97.12%
  0.399418 seconds (152.44 k allocations: 558.265 MiB, 4.65% gc time)
Epoch 4: Test Accuracy = 97.37%
  0.397601 seconds (152.44 k allocations: 558.265 MiB, 4.39% gc time)
Epoch 5: Test Accuracy = 97.22%
  0.438921 seconds (152.44 k allocations: 558.265 MiB, 4.41% gc time)
Epoch 6: Test Accuracy = 97.25%
  0.398900 seconds (152.44 k allocations: 558.265 MiB, 4.45% gc time)
Epoch 7: Test Accuracy = 97.67%
  0.392282 seconds (152.44 k allocations: 558.265 MiB, 3.94% gc time)
Epoch 8: Test Accuracy = 97.73%
  0.401077 seconds (152.44 k allocations: 558.265 MiB, 4.03% gc time)
Epoch 9: Test Accuracy = 97.8%
  0.391978 seconds (152.44 k allocation

In [14]:
# Classify a single image from the test set and compare against its true label.
println("\nExample Prediction:")
index = 42 # Let's test the 42nd image
single_image = test_x[:, index]  # one column = one flattened image
true_label = test_y_raw[index]
# Get the model's raw output (logits) for this single image
prediction_logits = model(single_image)
# Find the class with the highest score. For a single vector `onecold` already
# returns one label (not a collection), so the result must not be indexed.
predicted_label = onecold(prediction_logits, 0:9)

println("  - True Label:      ", true_label)
println("  - Predicted Label: ", predicted_label)
if predicted_label == true_label
    println("  - Result: Correct! ✅")
else
    println("  - Result: Incorrect! ❌")
end


Example Prediction:
  - True Label:      7
  - Predicted Label: 7
  - Result: Correct! ✅


In [15]:
# Evaluate the trained model on the whole test set: predicted digit per image,
# then the fraction of predictions that equal the true labels.
final_predictions = onecold(model(test_x), 0:9)
final_accuracy = mean(final_predictions .== test_y_raw)

# Print the result framed by two horizontal rules.
let bar = "-------------------------------------------"
    println(bar)
    println("Final Test Accuracy: ", round(final_accuracy * 100, digits=2), "%")
    println(bar)
end

-------------------------------------------
Final Test Accuracy: 97.74%
-------------------------------------------
