In [None]:
import numpy as np
import pandas as pd
import matplotlib.patheffects as PathEffects
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

import os
import requests
import time

import mnist_reader

In [None]:
# Download the Fashion-MNIST archives into `dataset_directory`.
# Files that are already present (and non-empty) are not fetched again, so
# re-running the notebook does not repeat the downloads.
urls = [
    'http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-labels-idx1-ubyte.gz',
    'http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-images-idx3-ubyte.gz',
    'http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-labels-idx1-ubyte.gz',
    'http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-images-idx3-ubyte.gz',
]
dataset_directory = './data/fashion'

os.makedirs(dataset_directory, exist_ok=True)

for i, url in enumerate(urls):
    filename = os.path.join(dataset_directory, os.path.basename(url))

    if os.path.isfile(filename) and os.path.getsize(filename) > 0:
        print(f'Already present [{i + 1}/{len(urls)}]: {filename}')
        continue

    # Time out instead of hanging forever, and fail loudly on an HTTP error
    # rather than saving the error page under a .gz name.
    r = requests.get(url, timeout=60)
    r.raise_for_status()

    # Write to a temporary name first so an interrupted run never leaves a
    # truncated archive at the final path.
    partial_filename = filename + '.part'
    with open(partial_filename, 'wb') as f:
        f.write(r.content)
    os.replace(partial_filename, filename)

    print(f'Downloaded file [{i + 1}/{len(urls)}]: {filename}')

In [None]:
# Load the train and test splits from the directory the download cell wrote to.
# Reusing `dataset_directory` keeps the download and load paths in sync.
X_train, y_train = mnist_reader.load_mnist(dataset_directory, kind='train')
X_test, y_test = mnist_reader.load_mnist(dataset_directory, kind='t10k')

print(f'shape of training data: {X_train.shape}')
print(f'label values: {y_train}')

In [None]:
# Notebook-wide seaborn appearance: dark grid, muted palette, and the
# "notebook" context with larger fonts and thicker lines.
sns.set_style('darkgrid')
sns.set_palette('muted')
sns.set_context("notebook", font_scale=1.5, rc={"lines.linewidth": 2.5})

# Random state passed to the t-SNE runs below so their results are reproducible
RS = 123

In [None]:
def fashion_scatter(x, colors):
    """Scatter-plot a two-column embedding, coloured and annotated per class.

    Parameters
    ----------
    x : array-like with two columns
        Point coordinates; column 0 is plotted on x, column 1 on y.
    colors : array-like of class labels, one per row of ``x``
        Labels are cast to ``int``. They do not have to be contiguous or
        start at zero: each distinct label gets its own palette entry.

    Returns
    -------
    tuple
        ``(f, ax, sc, txts)``: the figure, its axes, the scatter artist and
        the list of text artists (one per distinct label).
    """
    x = np.asarray(x)
    class_ids = np.asarray(colors).astype(int)

    # `labels` are the distinct class ids (sorted); `palette_idx` gives, for
    # every point, the position of its label within `labels`. Indexing the
    # palette by position instead of by raw label value keeps this working
    # when a class is absent or labels do not start at 0.
    labels, palette_idx = np.unique(class_ids, return_inverse=True)
    num_classes = len(labels)

    # choose a color palette with seaborn
    palette = np.array(sns.color_palette("hls", num_classes))

    # create a scatter plot
    f = plt.figure(figsize=(8, 8))
    ax = plt.subplot(aspect='equal')
    sc = ax.scatter(x[:,0], x[:,1], lw=0, s=40, c=palette[palette_idx])
    # NOTE(review): ax.axis('tight') below re-fits the limits to the data, so
    # these fixed limits look like they are overridden - confirm before removing.
    plt.xlim(-25, 25)
    plt.ylim(-25, 25)
    ax.axis('off')
    ax.axis('tight')

    # add one text label per class, placed at the median of its points
    txts = []

    for label in labels:
        xtext, ytext = np.median(x[class_ids == label, :], axis=0)
        txt = ax.text(xtext, ytext, str(label), fontsize=24)
        # white outline so the label stays readable on top of the points
        txt.set_path_effects([
            PathEffects.Stroke(linewidth=5, foreground="w"),
            PathEffects.Normal()])
        txts.append(txt)

    return f, ax, sc, txts

In [None]:
# Restrict the visualizations to the first 20k training samples and labels
x_subset, y_subset = X_train[:20000], y_train[:20000]

# Show which label values occur in the subset
print(np.unique(y_subset))

In [None]:
# Run PCA, keeping the first four principal components
time_start = time.time()

# Seed the estimator with the notebook-wide random state: depending on the
# scikit-learn version and data size, PCA may use a randomized SVD solver,
# whose output is only reproducible when random_state is fixed.
pca = PCA(n_components=4, random_state=RS)
pca_result = pca.fit_transform(x_subset)

print('PCA done! Time elapsed: {} seconds'.format(time.time()-time_start))

In [None]:
# Place the first four PCA components into a dataframe, one column each.
# Building the frame in a single constructor call avoids creating an empty
# frame and then filling it column by column.
pca_df = pd.DataFrame(pca_result[:, :4], columns=['pca1','pca2','pca3','pca4'])

print('Variance explained per principal component: {}'.format(pca.explained_variance_ratio_))

In [None]:
# Keep only the first two principal components for plotting
top_two_comp = pca_df.loc[:, ['pca1', 'pca2']]

# Scatter the PCA projection, coloured by class label
fashion_scatter(top_two_comp.values, y_subset)

In [None]:
# Run t-SNE (takes 1.5 minutes)
time_start = time.time()

# Two-component embedding of the raw pixel data, seeded with RS
tsne = TSNE(n_components=2, random_state=RS)
fashion_tsne = tsne.fit_transform(x_subset)

print('t-SNE done! Time elapsed: {} seconds'.format(time.time()-time_start))

In [None]:
# Visualize t-SNE results: scatter the two-component embedding computed from
# the raw subset, coloured and annotated by class label. The call is the
# cell's last expression, so its returned tuple is also the cell output.
fashion_scatter(fashion_tsne, y_subset)

In [None]:
# Run PCA as a precursor step to a faster, more approximate t-SNE
time_start = time.time()

# Seed the estimator with the notebook-wide random state: depending on the
# scikit-learn version and data size, PCA may use a randomized SVD solver,
# whose output is only reproducible when random_state is fixed.
pca_50 = PCA(n_components=50, random_state=RS)
pca_result_50 = pca_50.fit_transform(x_subset)

print('PCA with 50 components done! Time elapsed: {} seconds'.format(time.time()-time_start))

print('Cumulative variance explained by 50 principal components: {}'.format(np.sum(pca_50.explained_variance_ratio_)))

In [None]:
# Run t-SNE on PCA results (takes 1.25 minutes)
time_start = time.time()

# Embed the 50-component PCA projection instead of the raw pixels, seeded with RS
pca_tsne = TSNE(random_state=RS)
fashion_pca_tsne = pca_tsne.fit_transform(pca_result_50)

print('t-SNE done! Time elapsed: {} seconds'.format(time.time()-time_start))

In [None]:
# Visualize PCA/t-SNE results: scatter the t-SNE embedding that was computed
# from the 50-component PCA projection, coloured and annotated by class label.
# The call is the cell's last expression, so its returned tuple is also the
# cell output.
fashion_scatter(fashion_pca_tsne, y_subset)