<a href="https://colab.research.google.com/github/daniel-falk/ai-ml-principles-exercises/blob/main/ML-training/activeloop-deeplake/digit_classification_sklearn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Load and explore the dataset
The dataset used in this exercise is a digit recognition dataset from the *ActiveLoop* dataset hub.

In [None]:
!pip install deeplake

import numpy as np
import deeplake

In [None]:
# Locations of the two MNIST splits on the ActiveLoop hub
TRAIN_DATASET_URI = "hub://activeloop/mnist-train"
TEST_DATASET_URI = "hub://activeloop/mnist-test"

# Open the training split first, then the test split
ds_train = deeplake.load(TRAIN_DATASET_URI)
ds_test = deeplake.load(TEST_DATASET_URI)

In [None]:
# List the names of the tensors stored in the test dataset
ds_test.tensors.keys()

In [None]:
# Shape of the `images` tensor of the test dataset
ds_test.images.shape

In [None]:
# Shape of the `labels` tensor of the test dataset
ds_test.labels.shape

In [None]:
# Show the `info` attribute (metadata) attached to the `labels` tensor
ds_test.labels.info

In [None]:
from PIL import Image

# Index of the test sample to inspect
i = 100

# Fetch the label value with .numpy(); printing the tensor object itself
# would show the Deep Lake tensor rather than the digit it holds.
print(ds_test.labels[i].numpy())

# Render the sample as an 8-bit grayscale image ("L" mode) and enlarge it to
# 100x100 pixels. Nearest-neighbour resampling keeps the original pixels sharp
# instead of blurring them.
Image.fromarray(ds_test.images[i].numpy(), "L").resize((100, 100), resample=Image.NEAREST)

In [None]:
# Show the maximum value found in any of the first 100 images of the test dataset
np.max(ds_test.images[:100])

# Train a model

* Train an SVM classifier
* Evaluate the classifier

In [None]:
# Randomly select 10k images to use for training.
# A seeded generator makes the selected subset, and therefore every metric
# computed further down, identical across "Restart & Run All" executions.
SEED = 42
N_TRAIN_SAMPLES = 10_000

rng = np.random.default_rng(SEED)
# Passing an int samples from 0..len(ds_train)-1; replace=False guarantees
# that no training image is picked twice.
train_idx = rng.choice(len(ds_train), size=N_TRAIN_SAMPLES, replace=False)

In [None]:
# Prefetch the datasets to RAM and reshape to flatten the images.
# Note that we could index the training samples using train_idx before
# calling the numpy() method to download them, this would cause us to only
# download the samples we are actually going to use. In this particular case
# where the samples are tiny it is however faster to first download the full
# dataset and then slice it, this is due to the chunking of downloads in deeplake.
#
# Images and labels are both downloaded in full and then sliced with the very
# same NumPy index array, so row k of X_train always belongs to y_train[k].
X_train = ds_train.images.numpy().reshape((len(ds_train), -1))[train_idx]
X_test = ds_test.images.numpy().reshape((len(ds_test), -1))
y_train = ds_train.labels.numpy().flatten()[train_idx]
y_test = ds_test.labels.numpy().flatten()

# Every sample must have exactly one label
assert len(X_train) == len(y_train), "training images and labels are misaligned"
assert len(X_test) == len(y_test), "test images and labels are misaligned"

In [None]:
from sklearn import svm

In [None]:
# Create a support vector classifier using scikit-learn's default settings
model = svm.SVC()
# Fit it on the flattened training images and their labels
model.fit(X_train, y_train)

In [None]:
# Classify the ten leading samples of the test dataset
model.predict(X_test[:10])

In [None]:
# Ground-truth labels of the ten leading samples of the test dataset
y_test[:10]

In [None]:
# Element-wise check: does each of the first ten predictions equal its true label?
model.predict(X_test[:10]) == y_test[:10]

In [None]:
from sklearn import metrics

# Classify the complete test set once and keep the result in `predicted`
predicted = model.predict(X_test)

# Build the per-class precision / recall / F1 summary, then print it
report = metrics.classification_report(y_test, predicted)
print(report)

In [None]:
# Plot how often each true digit (rows) was predicted as each digit (columns)
cm_display = metrics.ConfusionMatrixDisplay.from_predictions(y_test, predicted)
# Give the figure a title so it can be understood on its own; the trailing
# semicolon keeps the object repr out of the cell output.
cm_display.ax_.set_title("Confusion matrix on the MNIST test set");