## Create TensorBoard Image Embedding Projector

In [1]:
import os,cv2
import numpy as np
import pandas as pd
import requests
import shutil
import matplotlib.pyplot as plt
import pickle
import tensorflow as tf
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.cluster import MeanShift
from sklearn.preprocessing import MinMaxScaler
from tensorflow.contrib.tensorboard.plugins import projector
# Show the installed TensorFlow version (the recorded run reported 1.15.0).
tf.__version__

  from ._conv import register_converters as _register_converters


'1.15.0'

### Load Feature data

In [2]:
# Build an {asin: image_url} lookup from the URL table.
imgUrl_df = pd.read_csv('./Results/asin_url_for_50k.csv')
imgUrl_dict = imgUrl_df.set_index('asin')['url'].to_dict()

In [3]:
# Read the feature table. The unnamed first CSV column is renamed to
# 'product_id' and becomes the index.
feature_vectors = (
    pd.read_csv('./Results/features_50k.csv')
    .rename(columns={"Unnamed: 0": "product_id"})
    .set_index('product_id')
)

In [4]:
# Report the table size: one row per image, one column per feature dimension.
print("feature_vectors_shape:", feature_vectors.shape)
print("num of images:", len(feature_vectors))
print("size of individual feature vector:", len(feature_vectors.columns))

feature_vectors_shape: (4249, 4096)
num of images: 4249
size of individual feature vector: 4096


### Clustering (15 clusters)

In [5]:
# Rescale every feature column with MinMaxScaler before clustering.
mms = MinMaxScaler()
feature_transformed = mms.fit_transform(feature_vectors)

# K-means with 15 clusters and a fixed seed.
km = KMeans(n_clusters=15, random_state=0).fit(feature_transformed)

In [6]:
# Keep a private copy of the per-row cluster assignments and show its shape.
label = km.labels_.copy()
label.shape

(4249,)

In [7]:
# Count the products per cluster: one (cluster id, count) row for each cluster.
(unique, counts) = np.unique(label, return_counts=True)
frequencies = np.column_stack((unique, counts))

print(frequencies)

[[  0 166]
 [  1 214]
 [  2 332]
 [  3 223]
 [  4 264]
 [  5 317]
 [  6 302]
 [  7 302]
 [  8 239]
 [  9 122]
 [ 10 489]
 [ 11 435]
 [ 12 264]
 [ 13 299]
 [ 14 281]]


In [8]:
# Persist the cluster assignment of every row to a text file. `label` is 1-D,
# so np.savetxt writes one value per line (in its default float format).
np.savetxt('./Results/Cluster_result_15.txt', label, delimiter= ',')

In [9]:
# Attach the cluster id to a *new* frame. A plain `feature_label_df = feature_vectors`
# only creates an alias, so adding the column would also modify `feature_vectors`;
# re-running the scaling / K-means cell would then treat 'cluster' as a feature.
feature_label_df = feature_vectors.assign(cluster=label)

In [10]:
# Preview the first rows of the feature table together with its 'cluster' column.
feature_label_df.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,4087,4088,4089,4090,4091,4092,4093,4094,4095,cluster
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
B008JFDFIW,0.0,0.0,1.847189,2.64175,0.0,0.0,0.0,6.623529,0.0,0.0,...,0.0,0.0,1.501545,0.0,0.0,0.0,0.0,0.0,1.291022,11
B00IM5FCOY,0.0,0.0,0.164301,0.0,0.0,1.67307,0.0,0.0,0.0,6.619954,...,0.0,0.0,0.0,0.0,5.276841,0.0,0.0,0.0,0.0,7
B00AAHNZUU,0.0,0.0,0.0,0.855917,0.0,0.0,0.0,0.0,0.0,3.871443,...,0.0,0.0,0.0,1.8203,4.572748,0.0,0.0,0.0,0.0,13
B004J4WWK4,0.0,0.0,0.0,0.0,0.0,3.584921,0.0,0.0,0.0,1.201754,...,0.0,0.0,0.0,0.0,6.192037,0.0,0.0,0.0,0.0,5
B000BIUAYW,0.0,0.0,0.0,4.482935,0.0,0.0,2.517327,0.0,0.0,3.51205,...,0.0,0.0,3.508799,0.0,0.0,0.0,0.0,0.0,1.227973,13


### Sampling 

In [11]:
# Draw 30 products from each of the 15 clusters.
# The fixed random_state makes the draw repeatable: without it every run picks
# different products, so the downloaded images, the metadata and the sprite
# would change from one execution to the next.
frames = []
for i in range(15):
    df = feature_label_df[feature_label_df['cluster']==i].sample(n=30, random_state=0)
    frames.append(df)

In [12]:
# Stack the per-cluster samples (kept in cluster order) into one frame and show its shape.
sample_feature_label_df = pd.concat(frames)
sample_feature_label_df.shape

(450, 4097)

In [13]:
# Preview the first rows of the sampled table.
sample_feature_label_df.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,4087,4088,4089,4090,4091,4092,4093,4094,4095,cluster
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
B00BMVL3FM,0.0,0.0,0.0,0.0,0.0,1.772492,0.0,1.954608,0.0,6.637149,...,0.0,0.0,0.180952,0.0,0.0,0.0,0.0,0.0,0.0,0
B0007WDF7G,0.0,0.0,0.0,0.0,0.0,2.186482,1.742248,0.0,0.0,2.113785,...,0.0,0.0,0.0,0.0,8.983343,0.0,0.0,0.0,0.0,0
B00IFVL0Q4,0.0,0.0,0.484371,0.0,0.0,1.373306,0.494256,0.0,0.0,0.0,...,0.367028,0.0,0.0,0.0,5.257505,0.0,0.0,0.0,0.0,0
B008OV4IUA,0.0,0.0,0.0,0.0,0.0,1.079173,0.0,0.0,0.0,0.0,...,0.0,0.0,0.442839,0.0,0.0,0.0,0.0,0.0,0.0,0
B00F0IMQYU,0.0,0.0,0.0,0.0,0.0,0.361643,0.101608,0.0,0.0,2.26984,...,0.0,0.0,0.0,0.0,4.591999,0.0,0.719977,1.058692,0.0,0


In [14]:
# Product ids of the sampled rows, in row order.
products = sample_feature_label_df.index.tolist()

In [25]:
Image_Path = './Sample_Images_for_tensorboard/Images_50K_sample_450/'
# Make sure the target folder exists before any file is written into it.
os.makedirs(Image_Path, exist_ok=True)

# Download the image of every sampled product to <Image_Path>/<asin>.jpg.
for asin in products:
    url = imgUrl_dict[asin]
    path = os.path.join(Image_Path, asin + "." + "jpg")
    # stream=True exposes the body as a raw stream that can be copied straight
    # to disk; the timeout keeps one stalled request from hanging the notebook.
    # The `with` blocks release the connection and close the file even on error.
    with requests.get(url, stream=True, timeout=30) as resp:
        # Fail loudly on an HTTP error instead of saving an error page as a .jpg.
        resp.raise_for_status()
        # Have the raw stream undo any Content-Encoding so the bytes written
        # are the image itself.
        resp.raw.decode_content = True
        with open(path, 'wb') as local_file:
            shutil.copyfileobj(resp.raw, local_file)

In [26]:
# sample_feature_label_df.iloc[:,:-1]

### Get features & labels

In [27]:
# Wrap the feature columns (every column except the last one, 'cluster') in a
# TensorFlow variable named 'features'.
features = tf.Variable(sample_feature_label_df.iloc[:,:-1], name='features')

In [28]:
# Cluster id of every sampled row, as a NumPy array, followed by its length.
y = sample_feature_label_df['cluster'].values
len(y)

450

### Visualize - TensorBoard 

#### - create a log directory.

In [29]:
PATH = os.getcwd()

# Folder that receives the checkpoint, metadata file, sprite image and
# projector config.
LOG_DIR = PATH+ '/embedding-logs'
# Create it up front: the cells below open files inside LOG_DIR for writing,
# which raises FileNotFoundError when the folder does not exist yet.
os.makedirs(LOG_DIR, exist_ok=True)

# Folder holding the downloaded sample images (trailing slash is required
# because file names are appended to it by plain string concatenation).
data_path = PATH + '/Sample_Images_for_tensorboard/Images_50K_sample_450/'
data_dir_list = os.listdir(data_path)
print("number of images:",len(data_dir_list))

number of images: 450


In [30]:
# Resize images
img_data=[]
for img in data_dir_list:
    
#     img_list=os.listdir(data_path+'/'+ dataset)
#     print ('Loaded the images of dataset-'+'{}\n'.format(dataset))
#     print(img_list)
#     for img in img_list:

    input_img=cv2.imread(data_path + str(img) )
    input_img_resize=cv2.resize(input_img,(224,224))
    img_data.append(input_img_resize)
    
                
img_data = np.array(img_data)

In [31]:
# Number of images that were loaded into img_data.
print("number of images:",len(img_data))

number of images: 450


In [32]:
# Shape of one resized image (the entry at index 1 is used as the example).
print("shape of individual image data:", img_data[1].shape)

shape of individual image data: (224, 224, 3)


In [35]:
# Total number of sampled rows (one embedding point each).
num_of_samples = len(sample_feature_label_df)

In [36]:
# Display the sample count.
num_of_samples

450

In [37]:
# Human-readable name of each cluster id: 'cluster0' ... 'cluster14'.
names = ['cluster{}'.format(idx) for idx in range(15)]

# Write one metadata row per embedding point, in the same order as the rows of
# the 'features' variable: a header line, then "<cluster id>\t<cluster name>".
#
# The Class column is taken directly from the cluster label. The previous
# counter advanced every 20 rows although 30 rows are sampled per cluster, so
# its values matched neither the cluster nor the Name column.
#
# The `with` block guarantees the file is flushed and closed even if a write fails.
with open(os.path.join(LOG_DIR, 'metadata_4_classes.tsv'), 'w') as metadata_file:
    metadata_file.write('Class\tName\n')
    for cluster_id in y[:num_of_samples]:
        metadata_file.write('{}\t{}\n'.format(cluster_id, names[cluster_id]))

In [38]:
# Taken from: https://github.com/tensorflow/tensorflow/issues/6322
def images_to_sprite(data):
    """Tile a batch of images into one square sprite image.

    Every image is min-max normalised on its own (its smallest value maps to 0
    and its largest to 255), the batch is padded with black tiles up to the
    next perfect square, and the tiles are laid out row by row.

    Args:
      data: NxHxW[x3] array containing the images. NxHxW input is replicated
        across three channels.

    Returns:
      A uint8 array of shape (n*H, n*W, 3) with n = ceil(sqrt(N)).
    """
    if len(data.shape) == 3:
        data = np.tile(data[..., np.newaxis], (1, 1, 1, 3))
    data = data.astype(np.float32)
    num_images = data.shape[0]

    # Shift each image so that its smallest value becomes 0.
    lowest = data.reshape((num_images, -1)).min(axis=1)
    data = data - lowest.reshape((-1, 1, 1, 1))

    # Scale each image so that its largest value becomes 1. A constant image
    # has a maximum of 0 after the shift; dividing by it would produce NaN
    # (and an undefined uint8 cast), so such images are left as all zeros.
    highest = data.reshape((num_images, -1)).max(axis=1)
    highest[highest == 0] = 1
    data = data / highest.reshape((-1, 1, 1, 1))

    # Pad with black images so the batch fills an n x n grid.
    n = int(np.ceil(np.sqrt(num_images)))
    padding = ((0, n ** 2 - num_images), (0, 0), (0, 0), (0, 0))
    data = np.pad(data, padding, mode='constant', constant_values=0)

    # Tile the individual thumbnails into an image:
    # (n, n, H, W, C) -> (n, H, n, W, C) -> (n*H, n*W, C).
    data = data.reshape((n, n) + data.shape[1:]).transpose((0, 2, 1, 3, 4))
    data = data.reshape((n * data.shape[1], n * data.shape[3]) + data.shape[4:])
    return (data * 255).astype(np.uint8)

In [39]:
# Tile the resized images into one sprite and save it as a PNG inside LOG_DIR.
sprite = images_to_sprite(img_data)
# The value returned by cv2.imwrite is shown as this cell's output (True in the recorded run).
cv2.imwrite(os.path.join(LOG_DIR, 'sprite_4_classes.png'), sprite)
#scipy.misc.imsave(os.path.join(LOG_DIR, 'sprite.png'), sprite)



True

In [40]:
# Save the 'features' variable to a checkpoint that TensorBoard can load.
with tf.Session() as sess:
    saver = tf.train.Saver([features])

    sess.run(features.initializer)
    saver.save(sess, os.path.join(LOG_DIR, 'images_4_classes.ckpt'))

# Describe the embedding for the projector plugin (no session needed for this).
config = projector.ProjectorConfig()
# One can add multiple embeddings.
embedding = config.embeddings.add()
embedding.tensor_name = features.name
# Link this tensor to its metadata file (e.g. labels).
embedding.metadata_path = os.path.join(LOG_DIR, 'metadata_4_classes.tsv')
# Comment out if you don't want sprites
embedding.sprite.image_path = os.path.join(LOG_DIR, 'sprite_4_classes.png')
# single_image_dim is [width, height]; img_data is (N, height, width, channels).
embedding.sprite.single_image_dim.extend([img_data.shape[2], img_data.shape[1]])

# Saves a config file that TensorBoard will read during startup.
summary_writer = tf.summary.FileWriter(LOG_DIR)
try:
    projector.visualize_embeddings(summary_writer, config)
finally:
    # Flush and release the event file opened by the writer.
    summary_writer.close()



In [None]:
tensorboard --logdir=embedding-logs