In [1]:
%matplotlib widget

import tensorflow as tf
import os
import sys
import cv2
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import seaborn as sns
import random

from tensorflow import keras
from tensorflow.keras.preprocessing import image
from tensorflow.keras.applications.resnet50 import ResNet50, preprocess_input
from tensorflow.keras.layers import GlobalMaxPooling2D
from numpy.linalg import norm
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from annoy import AnnoyIndex

In [2]:
# Root folder of the fashion dataset. The FASHION_DATASET_PATH environment
# variable overrides the default, so the notebook can run on another machine
# without editing this cell.
DATASET_PATH = os.environ.get(
    "FASHION_DATASET_PATH",
    "/Users/jeremy/Google Drive/datasets/fashion-dataset/",
)
# Later cells build file paths with plain string concatenation
# (DATASET_PATH + "images/" ...), so the root must end with a separator.
if not DATASET_PATH.endswith("/"):
    DATASET_PATH += "/"
print(os.listdir(DATASET_PATH))

['embeddings.tsv', '.DS_Store', 'images.csv', 'images', 'resnet50-embeddings-full.pkl', 'styles_preprocessed.csv', 'styles', 'styles.csv', '.ipynb_checkpoints', 'embeddings.csv', 'resnet50-embeddings.pkl']


In [3]:
# Load the preprocessed style metadata.
# `error_bad_lines=False` was deprecated in pandas 1.3 and removed in 2.0;
# `on_bad_lines="warn"` is its replacement and keeps the old behaviour of
# skipping malformed rows while emitting a warning for each of them.
df = pd.read_csv(DATASET_PATH + "styles_preprocessed.csv", on_bad_lines="warn")
# Image file name is "<id>.jpg"; built column-wise instead of with a row-wise apply.
df['image'] = df['id'].astype(str) + ".jpg"
df = df.reset_index(drop=True)
df.head(10)

Unnamed: 0,id,gender,masterCategory,subCategory,articleType,baseColour,season,year,usage,productDisplayName,image
0,15970,Men,Apparel,Topwear,Shirts,Navy Blue,Fall,2011,Casual,Turtle Check Men Navy Blue Shirt,15970.jpg
1,39386,Men,Apparel,Bottomwear,Jeans,Blue,Summer,2012,Casual,Peter England Men Party Blue Jeans,39386.jpg
2,59263,Women,Accessories,Watches,Watches,Silver,Winter,2016,Casual,Titan Women Silver Watch,59263.jpg
3,21379,Men,Apparel,Bottomwear,Track Pants,Black,Fall,2011,Casual,Manchester United Men Solid Black Track Pants,21379.jpg
4,53759,Men,Apparel,Topwear,Tshirts,Grey,Summer,2012,Casual,Puma Men Grey T-shirt,53759.jpg
5,1855,Men,Apparel,Topwear,Tshirts,Grey,Summer,2011,Casual,Inkfruit Mens Chain Reaction T-shirt,1855.jpg
6,30805,Men,Apparel,Topwear,Shirts,Green,Summer,2012,Ethnic,Fabindia Men Striped Green Shirt,30805.jpg
7,26960,Women,Apparel,Topwear,Shirts,Purple,Summer,2012,Casual,Jealous 21 Women Purple Shirt,26960.jpg
8,29114,Men,Accessories,Socks,Socks,Navy Blue,Summer,2012,Casual,Puma Men Pack of 3 Socks,29114.jpg
9,30039,Men,Accessories,Watches,Watches,Black,Winter,2016,Casual,Skagen Men Black Watch,30039.jpg


In [4]:
def get_img_path(img):
  """Return the path of image file `img` inside the dataset's "images/" folder.

  The path is built by string concatenation on the module-level DATASET_PATH,
  so DATASET_PATH is expected to end with a path separator.
  """
  images_dir = DATASET_PATH + "images/"
  return images_dir + img

In [5]:
# Load the precomputed embeddings. The context manager closes the file handle,
# which the previous `pickle.load(open(...))` form left open.
# NOTE: unpickling can execute arbitrary code - only load files you trust.
with open(DATASET_PATH + "resnet50-embeddings-full.pkl", "rb") as emb_file:
    df_embs = pickle.load(emb_file)

In [6]:
# Keep only the rows whose master category is 'Apparel'. The original row
# labels are stored in df_filtered_idx *before* the index is reset, so the
# matching embedding rows can be selected with them in a later cell.
df_filtered = df[df["masterCategory"] == "Apparel"]
df_filtered_idx = df_filtered.index.tolist()
df_filtered = df_filtered.reset_index(drop=True)
# Sanity check: remaining categories, frame shape, number of remembered labels
df_filtered["masterCategory"].unique(), df_filtered.shape, len(df_filtered_idx)

(array(['Apparel'], dtype=object), (21393, 11), 21393)

In [7]:
# Select the embedding rows whose index label belongs to the filtered items.
# A vectorised membership mask replaces the previous iterrows() loop, whose
# `i in df_filtered_idx` list lookup made the cell quadratic in the row count.
# Row order of df_embs is preserved, exactly as the loop did.
embs_filtered = df_embs.loc[df_embs.index.isin(df_filtered_idx)]
# Re-number from 0 so positions line up with df_filtered (which was also reset).
# NOTE(review): later lookups assume both frames have the same rows in the same
# order - confirm that every label in df_filtered_idx exists in df_embs.
df_embs_filtered = embs_filtered.reset_index(drop=True)
df_embs_filtered

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,5.661311,2.366158,0.000000,3.255738,0.845207,2.673839,8.780884,6.280785,0.787186,1.587661,...,2.463607,0.000000,1.825991,20.485271,3.902871,0.000000,8.225970,1.313863,0.000000,10.820534
1,5.790412,9.593652,0.000000,9.431687,0.287238,0.259950,17.028755,3.037456,0.136339,0.000000,...,1.670569,0.013894,10.338679,4.784732,0.189142,0.000000,7.739138,0.372693,0.000000,8.903797
2,5.882204,1.958622,0.000000,12.519884,0.000000,0.000000,9.383584,3.781623,0.000000,1.490620,...,17.825481,0.000000,3.592513,2.042306,0.000000,0.000000,6.656784,1.457313,1.694215,1.403330
3,0.245961,17.383436,0.493822,3.374461,2.567365,0.000000,6.482548,5.862635,0.000000,3.164864,...,4.464021,0.447082,1.448132,12.884141,0.000000,0.397487,5.028076,1.939697,0.000000,11.485396
4,0.028540,18.567101,0.000000,0.757250,1.504954,0.000000,1.982043,2.963786,1.072763,4.539298,...,3.247076,4.273134,3.088639,16.997049,0.000000,0.000000,2.258092,1.595798,0.000000,7.674057
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21388,0.000000,35.020271,7.460171,1.012842,1.366380,2.258118,1.922700,1.341036,1.293422,0.000000,...,4.813864,2.418916,1.957278,0.703634,6.915268,0.523176,4.877475,0.000000,0.000000,14.361758
21389,2.841607,15.895623,1.662815,2.716725,0.000000,0.727688,1.496747,0.472215,0.905088,3.890155,...,6.330880,2.010111,3.669139,13.017206,0.000000,0.749803,4.271129,1.611024,0.000000,9.624297
21390,3.243481,16.985628,1.560327,2.596826,0.545848,0.000000,9.023268,0.919529,1.380090,3.811848,...,13.571547,2.010174,1.121704,18.124388,1.862831,0.000000,1.212624,2.394026,3.093368,25.842484
21391,3.075987,6.170718,0.000000,0.242754,6.728174,0.000000,3.616792,1.160508,0.088385,1.244293,...,4.731680,1.630670,0.743359,15.961621,9.004664,0.551970,1.509514,0.000000,1.935909,13.389180


In [22]:
# Perform PCA over the embeddings to reduce dimensionality
num_feature_dimensions = 300  # Set the number of embedding dimensions
# A fixed random_state makes the projection reproducible across kernel
# restarts: for inputs of this size scikit-learn's automatic solver selection
# may pick the randomized SVD, whose result otherwise varies from run to run.
pca = PCA(n_components=num_feature_dimensions, random_state=42)
embs_compressed = pca.fit_transform(df_embs_filtered)
df_embs_filtered_compressed = pd.DataFrame(embs_compressed)
df_embs_filtered_compressed

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
0,-64.357910,2.009969,4.515368,48.357891,-19.128445,5.342452,48.397087,-27.092453,-30.359943,19.001760,...,-0.027479,-1.793284,-1.043051,-1.388653,-0.346039,2.874086,-0.520509,-0.245018,-0.169346,0.081849
1,-52.114098,-68.620903,-57.570988,-31.832296,53.722683,-23.362713,-2.748619,-17.558195,-21.904865,6.952492,...,4.537742,2.129923,1.952338,0.372517,1.476021,3.885345,-1.630003,-1.318896,0.166479,-2.227098
2,-22.425611,-59.430878,-64.441772,-27.177677,47.944099,-31.331270,-1.548168,8.081011,-4.350835,-14.550479,...,-2.090356,-3.501975,3.200779,4.224617,-4.884243,-1.513306,-1.040210,1.081879,-1.257495,0.841683
3,-54.588432,-30.280878,21.072157,-0.946770,-21.051865,-9.433448,-18.621870,3.578182,-16.154633,19.667513,...,0.119156,-1.052037,-1.247684,-0.772350,6.533217,1.171853,-1.329105,0.671467,-0.957433,-2.931145
4,-28.993326,-14.922001,54.522846,-20.936012,-14.654531,-7.483351,17.664167,19.237654,-10.022154,-5.284721,...,-0.574849,-2.741356,-0.488036,0.485247,-0.606040,-0.227766,-4.865824,0.322001,-5.071671,1.705147
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21388,106.352798,-50.901821,48.033264,14.035495,4.876539,-4.716382,-13.125351,-2.849083,-0.469377,-25.319630,...,3.201861,2.929503,2.313374,1.802102,0.520290,-4.872674,-0.329477,2.195545,0.210869,3.807374
21389,-37.274292,-15.107414,56.033501,-37.821644,7.631700,-12.318003,25.639122,-17.738274,14.464388,-2.579258,...,-0.201064,2.645311,-2.670207,-0.987432,-0.468722,3.742406,0.085996,3.795931,0.432865,5.938803
21390,-33.474800,26.566454,21.142603,-23.724031,21.568907,-24.361326,-19.552706,-66.617790,18.290022,-10.338690,...,2.189024,-0.568473,1.852798,1.372950,0.117880,-0.166211,-5.467371,-4.452588,2.948454,5.682208
21391,1.716179,63.075298,7.019126,-0.876224,25.028749,17.429161,22.952684,-33.428207,-2.201403,8.034510,...,-1.528930,-11.931191,-0.628966,-0.998691,-3.618734,-3.809790,-2.899342,-6.483532,6.194696,-6.653462


In [23]:
# Fit a linear SVM that predicts the season from the PCA-compressed embeddings.
# Its weight vector is used later as the direction to navigate along.
X = df_embs_filtered_compressed
y = df_filtered["season"]
# Standardise every component before training; the classifier (and therefore
# its coefficients) lives in this scaled space.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)
svm_clf = LinearSVC(max_iter=10_000_000, C=1)
svm_clf.fit(X_scaled, y)

LinearSVC(C=1, max_iter=10000000)

In [24]:
number_of_embeddings = df_filtered.shape[0]

# Vector length of the index, taken from the data instead of a hard-coded 300
# so that changing the number of PCA components cannot desynchronise the two.
f = df_embs_filtered_compressed.shape[1]
t = AnnoyIndex(f, 'euclidean')
# Item ids are the frame's index labels, as before. Rows are read straight from
# the underlying array; the old `row.iloc[row.index]` only worked because the
# column labels happened to equal their positions.
for index, vector in zip(df_embs_filtered_compressed.index,
                         df_embs_filtered_compressed.to_numpy()):
    t.add_item(index, vector.tolist())

# Build the forest with 10 trees; call t.save(<path>) afterwards to persist it.
t.build(10)

True

In [25]:
def get_nearest_neighbour(user_position, df):
   """Return the 'id' of the item closest to `user_position`.

   The module-level Annoy index `t` is queried for a single neighbour; the
   returned item number is used as a row *position* into `df`, whose 'id'
   column supplies the result.
   """
   neighbour_rows = t.get_nns_by_vector(user_position, 1)
   matched_ids = df.iloc[neighbour_rows]['id']
   return matched_ids.values[0]

In [26]:
def annotate_nearest_neighbour(nearest_neighbour, df):
    """Draw the product image of item `nearest_neighbour` on the current axes.

    The image file name is read from the 'image' column of the first row of
    `df` whose 'id' equals `nearest_neighbour`.
    """
    matching_rows = df.loc[df['id'] == nearest_neighbour]
    img_file = matching_rows.image.values[0]
    plt.imshow(plt.imread(get_img_path(img_file)))

In [27]:
def compute_navigation_axis(emb, w, dist):
    """Sample points on the line through an embedding along direction `w`.

    Parameters
    ----------
    emb : single-row DataFrame-like
        Starting embedding; the first value of every column is used.
    w : sequence of numbers
        Step per dimension, indexed by column position (`w[i]` belongs to the
        i-th column of `emb`).
    dist : int
        Number of steps taken in each direction from the starting point.

    Returns
    -------
    numpy.ndarray of shape (n_dimensions, 2 * dist + 1)
        Row i holds the i-th coordinate of every sampled point. Column
        `dist + k` is the point `emb + k * w` for k in [-dist, dist], so the
        centre column is the starting embedding itself.

    Notes
    -----
    Rows are deliberately NOT sorted. Sorting each dimension on its own (as an
    earlier version did) reverses the rows whose step is negative, so a column
    no longer represents `emb + k * w` but `emb + k * |w|` - a different
    direction. A row with a negative step is therefore descending.
    """
    start = np.array([emb[feature].values[0] for feature in emb])
    steps = np.array([w[i] for i in range(len(start))])
    offsets = np.arange(-dist, dist + 1)
    # Broadcast to (n_dimensions, 2 * dist + 1): start_i + k * step_i
    return start[:, None] + steps[:, None] * offsets[None, :]

In [28]:
def initialize_user_position(axis, idx):
    """Return a list holding element `idx` of every entry (dimension) in `axis`."""
    return [dim[idx] for dim in axis]

In [29]:
def update_user_position(change):
    """Slider callback: move the user to the sampled point closest to the new value.

    `change.new` (the new slider value) is matched against the first dimension
    of the module-level `nav_axis`; the column with the closest value becomes
    the new position. The nearest item to that position is then looked up and
    its image and title are redrawn on the module-level figure `fig`.

    Side effects: rebinds the module-level `user_position` and redraws `fig`.
    """
    # Previously this was assigned to a local, so the notebook-level
    # `user_position` never changed after a slider move.
    global user_position

    # Column whose first coordinate is closest to the slider value
    idx = int(np.argmin(np.abs(nav_axis[0] - change.new)))
    user_position = [nav_axis[i][idx] for i in range(len(nav_axis))]

    nearest_neighbour = get_nearest_neighbour(user_position, df_filtered)
    # Clear the axes first; otherwise every slider move stacks one more image
    # on top of the previous ones and the figure keeps growing in memory.
    plt.cla()
    annotate_nearest_neighbour(nearest_neighbour, df_filtered)
    plt.title('Nearest Embedding: {} with season: {}'.format(nearest_neighbour, df_filtered.loc[df_filtered['id'] == nearest_neighbour].season.values[0]))
    fig.canvas.draw()
    fig.canvas.flush_events()

In [50]:
from ipywidgets import AppLayout, FloatSlider

# Create matplotlib figure for displaying fashion items
fig = plt.figure(figsize=(7, 7))

# Create the navigation axis: a line through a randomly chosen embedding along
# the first row of the SVM coefficient matrix.
# NOTE(review): the SVM was fitted on standardised features (X_scaled) while the
# navigation happens in the unscaled PCA space - confirm this is intended.
w = svm_clf.coef_[0]
rand_emb = df_embs_filtered_compressed.sample()
nav_axis = compute_navigation_axis(rand_emb, w, 500)

# Start at the sampled point whose first coordinate matches the random embedding
starting_idx = np.where(np.isclose(nav_axis[0], min(nav_axis[0], key=lambda x:abs(x-rand_emb[0].values[0]))))[0][0]
user_position = initialize_user_position(nav_axis, starting_idx)

nearest_neighbour = get_nearest_neighbour(user_position, df_filtered)
annotate_nearest_neighbour(nearest_neighbour, df_filtered)

# The format string previously had only two placeholders for three arguments,
# so the title ended in "user_pos:" without a value.
plt.title('Nearest Embedding: {} with season: {}, user_pos: {}'.format(nearest_neighbour, df_filtered.loc[df_filtered['id'] == nearest_neighbour].season.values[0], user_position[0]))

# Create Slider to navigate in embedding space
slider = FloatSlider(
    orientation="horizontal",
    description="x-Position:",
    value=user_position[0],
    min=min(nav_axis[0]),
    max=max(nav_axis[0])
)
slider.layout.margin = '0px 10% 0px 10%'
slider.layout.width = '40%'

# Redraw the nearest item whenever the slider value changes
slider.observe(update_user_position, names='value')

display(slider)

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

FloatSlider(value=33.42757797241211, description='x-Position:', layout=Layout(margin='0px 10% 0px 10%', width=…