In [1]:
import os, io, botocore
import boto3

TRAIN_CSV = "data/mnist_train.csv"
TEST_CSV = "data/mnist_test.csv"

# Read from environment
endpoint = os.getenv("AWS_S3_ENDPOINT")
bucket = os.getenv("AWS_S3_BUCKET")
access_key = os.getenv("AWS_ACCESS_KEY_ID")
secret_key = os.getenv("AWS_SECRET_ACCESS_KEY")
region = os.getenv("AWS_DEFAULT_REGION", "us-east-1")

# Create an S3 client compatible with MinIO
s3 = boto3.client(
    "s3",
    endpoint_url=endpoint,
    aws_access_key_id=access_key,
    aws_secret_access_key=secret_key,
    region_name=region,
)

# Optional: avoid the ?location call causing you trouble by telling boto3 the region upfront (done above)
# and not using bucket location lookups.

import tempfile
import pathlib

try:
    with tempfile.NamedTemporaryFile(delete=False) as f:
        # Get the pathlib.Path object
        print("download test data from S3 into temp file: ", f.name)
        test_path = pathlib.Path(f.name)
        # Pass the open file object
        s3.download_fileobj(bucket, TEST_CSV, f)

        # Flush to ensure data is written
        f.flush()

        print("Got test csv file:", test_path)

    with tempfile.NamedTemporaryFile(delete=False) as f:
        # Get the pathlib.Path object
        print("download train data from S3 into temp file: ", f.name)
        train_path = pathlib.Path(f.name)
        # Pass the open file object
        s3.download_fileobj(bucket, TEST_CSV, f)

        # Flush to ensure data is written
        f.flush()

        print("Got train csv file:", train_path)

except s3.exceptions.NoSuchKey:
    print("File not found")
except botocore.exceptions.ClientError as e:
    print("S3 error:", e)

download test data from S3 into temp file:  /tmp/tmpjqkziusx
Got test csv file: /tmp/tmpjqkziusx
download train data from S3 into temp file:  /tmp/tmp3h84i448
Got train csv file: /tmp/tmp3h84i448


In [2]:
import pandas as pd
import numpy as np
import tensorflow as tf

# Read CSV with header row
train_df = pd.read_csv(train_path)   # header inferred automatically
test_df = pd.read_csv(test_path)

# Separate label and pixel columns
train_labels = train_df["label"].astype(np.int32).values
train_features = train_df.drop(columns=["label"]).astype(np.float32).values

test_labels = test_df["label"].astype(np.int32).values
test_features = test_df.drop(columns=["label"]).astype(np.float32).values

# Reshape and normalize
train_features = train_features.reshape((-1, 28, 28, 1)) / 255.0
test_features = test_features.reshape((-1, 28, 28, 1)) / 255.0

# --- Check data integrity
print("\nTrain features shape:", train_features.shape)
print("Train labels shape:  ", train_labels.shape)
print("Unique labels:", np.unique(train_labels))

print("\nTest features shape:", test_features.shape)
print("Test labels shape:   ", test_labels.shape)
print("Unique labels:", np.unique(test_labels))

# Show first few feature samples (flattened)
print("\nSample feature array (first image, flattened 10 pixels):\n", train_features[0].flatten()[:10])
print("\nSample label for first image:", train_labels[0])

2025-10-10 06:06:55.654658: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1760076415.998767     314 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1760076416.050523     314 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1760076416.916717     314 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1760076416.916749     314 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1760076416.916753     314 computation_placer.cc:177] computation placer alr


Train features shape: (10000, 28, 28, 1)
Train labels shape:   (10000,)
Unique labels: [0 1 2 3 4 5 6 7 8 9]

Test features shape: (10000, 28, 28, 1)
Test labels shape:    (10000,)
Unique labels: [0 1 2 3 4 5 6 7 8 9]

Sample feature array (first image, flattened 10 pixels):
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]

Sample label for first image: 7


In [3]:
# Build model
print("building model")
model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Conv2D(32, (3,3), input_shape=(28,28,1)))
model.add(tf.keras.layers.MaxPooling2D(pool_size=(2,2), strides=2))
model.add(tf.keras.layers.Flatten())
model.add(tf.keras.layers.Dense(64, activation='relu'))
model.add(tf.keras.layers.Dropout(0.2))
model.add(tf.keras.layers.Dense(10, activation='softmax'))

print("compile model")
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

print(model.summary())

# Train and evaluate
model.fit(train_features, train_labels, epochs=3, verbose=1)
model.evaluate(test_features, test_labels)

building model
compile model


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
2025-10-10 06:07:15.350649: E external/local_xla/xla/stream_executor/cuda/cuda_platform.cc:51] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)


None
Epoch 1/3
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 24ms/step - accuracy: 0.8572 - loss: 0.4765
Epoch 2/3
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 23ms/step - accuracy: 0.9352 - loss: 0.2100
Epoch 3/3
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 23ms/step - accuracy: 0.9537 - loss: 0.1496
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 7ms/step - accuracy: 0.9793 - loss: 0.0732


[0.07317521423101425, 0.9793000221252441]

In [11]:
# Convert Keras model -> ONNX (robust across tf2onnx versions) and upload to MinIO/S3

import os
import tensorflow as tf
import tf2onnx
import boto3, botocore

# Define serving function + signature
INPUT_SIG = [tf.TensorSpec([None, 28, 28, 1], tf.float32, name="input")]

@tf.function(input_signature=INPUT_SIG)
def serving_fn(x):
    y = model(x, training=False)
    return {"probabilities": y}   # name your output

# Convert to ONNX (try API variant that your tf2onnx supports)
proto = None
try:
    # Preferred path for many versions: pass the tf.function + input_signature
    proto, _ = tf2onnx.convert.from_function(
        serving_fn,
        input_signature=INPUT_SIG,
        opset=13,
        output_path=None
    )
except Exception as e1:
    # Fallback: some builds accept a ConcreteFunction positional arg
    concrete = serving_fn.get_concrete_function()
    proto, _ = tf2onnx.convert.from_function(
        concrete,
        opset=13,
        output_path=None
    )

onnx_bytes = proto.SerializeToString()
print(f"✅ ONNX ready, size {len(onnx_bytes):,} bytes")

onnx_key = "models/mnist/model.onnx"
s3.put_object(Bucket=bucket, Key=onnx_key, Body=onnx_bytes, ContentType="application/octet-stream")
print(f"📤 Uploaded: s3://{bucket}/{onnx_key}")

✅ ONNX ready, size 1,391,691 bytes
📤 Uploaded: s3://mnist-ml/models/mnist/model.onnx


I0000 00:00:1760082196.536422     314 devices.cc:67] Number of eligible GPUs (core count >= 8, compute capability >= 0.0): 0
I0000 00:00:1760082196.536533     314 single_machine.cc:374] Starting new session
I0000 00:00:1760082196.574635     314 devices.cc:67] Number of eligible GPUs (core count >= 8, compute capability >= 0.0): 0
I0000 00:00:1760082196.574760     314 single_machine.cc:374] Starting new session


In [12]:
import os, io, tempfile

# 1) Save Keras native single-file format (.keras)
keras_path = "mnist.keras"
model.save(keras_path)  # Keras 3 native format (recommended single-file)
print(f"✅ Wrote {keras_path}")

keras_key = "models/mnist/model.keras"

with open(keras_path, "rb") as f:
    s3.put_object(
        Bucket=bucket,
        Key=keras_key,   # adjust path if you want versioning
        Body=f.read(),
        ContentType="application/octet-stream",
    )
print(f"📤 Uploaded: s3://{bucket}/{keras_key}")

✅ Wrote mnist.keras
📤 Uploaded: s3://mnist-ml/models/mnist/model.keras


In [13]:
# check model files are there
resp = s3.list_objects_v2(Bucket="mnist-ml", Prefix="models/mnist")
for o in resp.get("Contents", []):
    print(o["Key"])

models/mnist/model.keras
models/mnist/model.onnx
