In [None]:
import os
import numpy as np
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt

from keras.models import Model
from keras.layers import (
    Dense,
    Input,
    Conv2D
)

from skimage.io import imread
from sklearn.cluster import KMeans
from sklearn.metrics import normalized_mutual_info_score

%matplotlib inline

In [None]:
import sys

# Make the parent directory importable so the local `utils` package resolves.
# Guarded so that re-running this cell does not keep appending duplicates
# of the same entry to sys.path.
if '..' not in sys.path:
    sys.path.append('..')

from utils.s3_class import S3Functions

# S3 helper bound to the bucket this notebook reads its data from.
s3_funcs = S3Functions(bucket_name='jdgallegoq-pinacle')

In [None]:
import keras
import tensorflow as tf

# Report the framework versions this notebook is being run with.
for label, module in (('keras version: ', keras), ('tensorflow version: ', tf)):
    print(label, module.__version__)

In [None]:
# Seeds to limit (as far as possible) run-to-run random behaviour.
seed = 42

# Dedicated generator for the notebook's own sampling (e.g. picking an image to display).
rng = np.random.RandomState(seed)

# The private generator above has no effect on Keras/TensorFlow, so the global
# NumPy and TensorFlow generators are seeded as well.
# NOTE(review): assumes TensorFlow 2.x (`tf.random.set_seed`); some GPU kernels
# may still introduce small nondeterminism.
np.random.seed(seed)
tf.random.set_seed(seed)

In [None]:
# Key prefix under which the dataset files (csv and images) are looked up.
DATA_PATH = "mnist/"

In [None]:
# Read train.csv from the bucket into a DataFrame and print its column summary.
train_csv_key = DATA_PATH + 'train.csv'
train = pd.read_csv(s3_funcs.read_object(key=train_csv_key))
train.info()

In [None]:
# Read every image listed in train.csv from the bucket.
images = [
    s3_funcs.read_image(key=DATA_PATH + 'images/' + img_name)
    for img_name in tqdm(train.filename)
]

# Flatten each image into a single row so the array is 2-D, (n_images, 784),
# which is what the autoencoder's Input(shape=(784,)) expects. Reshaping to
# (n, -1, 784) instead would yield a 3-D (n, 1, 784) array that does not
# match that input layer.
# NOTE(review): assumes every image holds exactly 784 (28x28) values — confirm.
train_array = np.stack(images).reshape(len(images), 784).astype(np.float32)

In [None]:
# Inspect the pixel value range to decide whether the inputs still need scaling.
np.min(train_array), np.max(train_array)

In [None]:
# Sequential hold-out split (no shuffling): the first 70% of the rows are
# used for training and the remaining rows for validation.
train_size = 0.7
split_size = int(len(train_array) * train_size)

x_train = train_array[:split_size]
x_val = train_array[split_size:]

In [None]:
# model architecture: fully connected autoencoder over flattened 784-value images
input_img = Input(shape=(784,))

# --- ENCODER --- #
# Dense stack ending in a 10-unit linear layer; that layer's output is the
# learned embedding exposed through the `encoder` model below.
encoded = Dense(2000, activation='relu')(input_img)
encoded = Dense(500, activation='relu')(encoded)
encoded = Dense(100, activation='relu')(encoded)
encoded = Dense(10, activation='linear')(encoded)

# --- DECODER --- #
# Expands the embedding back out; note it has no 2000-unit layer, so it is
# not an exact mirror of the encoder.
decoded = Dense(100, activation='relu')(encoded)
decoded = Dense(500, activation='relu')(decoded)
# last layer must match input shape
decoded = Dense(784, activation='sigmoid')(decoded)

# autoencoder maps an input to its reconstruction;
# encoder shares the same layers but stops at the 10-unit embedding
autoencoder = Model(input_img, decoded)
encoder = Model(input_img, encoded)

# see summaries
# summary() prints the table itself and returns None, so it is called
# directly: wrapping it in print() would add a stray "None" line.
autoencoder.summary()
encoder.summary()

In [None]:
# compile
# We are reconstructing pixel values rather than classifying, so the loss is a
# regression loss (mean squared error).
# The keyword has to be spelled `optimizer`; a misspelled keyword is not
# recognised by compile() as the optimizer setting.
autoencoder.compile(optimizer='adam', loss='mse')

In [None]:
# train
# The autoencoder learns to reproduce its own input, so the same array is
# passed as both the input and the target (for training and validation alike).
epochs = 100  # must be a plain int: a trailing comma would make it the tuple (100,)
batch_size = 256
autoencoder.fit(
    x_train,
    x_train,
    epochs=epochs,
    batch_size=batch_size,
    validation_data=(x_val, x_val),
)

In [None]:
# now check how it is actually performing: reconstruct the first training
# image and display it. Only that one sample is sent through the network,
# since a single image is plotted.
reconstruction = autoencoder.predict(x_train[:1])
# 'gray' is the colormap name registered in every matplotlib release;
# the 'grey' spelling is not accepted by older versions.
plt.imshow(reconstruction[0].reshape(28, 28), cmap='gray')

In [None]:
# Essentially, the features that are useful for downstream steps are the
# encoder outputs (what the model really learned). Show them for the first
# training image only; encoding a one-row slice avoids a throw-away pass
# over the whole training set just to display a single vector.
encoded_sample = encoder.predict(x_train[:1])
encoded_sample[0]

In [None]:
# Run both splits through the encoder to obtain their embeddings
# (training split first, validation split second).
pred_auto_train, pred_auto_val = (
    encoder.predict(split) for split in (x_train, x_val)
)

In [None]:
# define K-means
# `n_jobs` is no longer accepted by KMeans in current scikit-learn, so it is
# not passed. `random_state` ties the centroid initialisation to the
# notebook seed, and `n_init` is pinned explicitly so the number of restarts
# does not depend on the installed scikit-learn version's default.
kmeans = KMeans(n_clusters=10, n_init=10, random_state=seed)
kmeans.fit(pred_auto_train)

# get clusters from val data
pred = kmeans.predict(pred_auto_val)

In [None]:
# visualize clusters: pick one random validation image and show the cluster
# it was assigned to
index = rng.choice(len(x_val))

print(f"This image belongs to cluster: {pred[index]}")
# 'gray' is the colormap name registered in every matplotlib release;
# the 'grey' spelling is not accepted by older versions.
plt.imshow(x_val[index].reshape(28, 28), cmap='gray')

In [None]:
# so, to get a more accurate understanding of how well it is performing
# create a pd dataframe with labels and clusters
y = train.label.values
# Slice the labels exactly like the images were split: the first `split_size`
# rows are the training part, the remaining rows the validation part.
# (With the slices the other way round, y_val would not line up with `pred`.)
y_train, y_val = y[:split_size], y[split_size:]

# compare with actual values
val_clusters = pd.DataFrame({"val_y": y_val, "cluster_name": pred})
val_clusters[val_clusters.cluster_name == 1].head()

In [None]:
# overall model score
# y[split_size:] are the labels of the validation rows (the same slice that
# produced x_val), passed as labels_true; `pred` holds the cluster assigned
# to each of those rows.
normalized_mutual_info_score(y[split_size:], pred)