In [1]:
%matplotlib widget

import tensorflow as tf
import os
import sys
import cv2
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import seaborn as sns
import random

from tensorflow import keras
from tensorflow.keras.preprocessing import image
from tensorflow.keras.applications.resnet50 import ResNet50, preprocess_input
from tensorflow.keras.layers import GlobalMaxPooling2D
from numpy.linalg import norm
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC

In [2]:
DATASET_PATH = "/Users/jeremy/Google Drive/datasets/fashion-dataset/"
print(os.listdir(DATASET_PATH))

['embeddings.tsv', '.DS_Store', 'images.csv', 'images', 'styles', 'styles.csv', '.ipynb_checkpoints', 'embeddings.csv', 'resnet50-embeddings.pkl']


In [3]:
df = pd.read_csv(DATASET_PATH + "styles.csv", nrows=5000, error_bad_lines=False)
df['image'] = df.apply(lambda row: str(row['id']) + ".jpg", axis=1)
df = df.reset_index(drop=True)
df.head(10)

Unnamed: 0,id,gender,masterCategory,subCategory,articleType,baseColour,season,year,usage,productDisplayName,image
0,15970,Men,Apparel,Topwear,Shirts,Navy Blue,Fall,2011,Casual,Turtle Check Men Navy Blue Shirt,15970.jpg
1,39386,Men,Apparel,Bottomwear,Jeans,Blue,Summer,2012,Casual,Peter England Men Party Blue Jeans,39386.jpg
2,59263,Women,Accessories,Watches,Watches,Silver,Winter,2016,Casual,Titan Women Silver Watch,59263.jpg
3,21379,Men,Apparel,Bottomwear,Track Pants,Black,Fall,2011,Casual,Manchester United Men Solid Black Track Pants,21379.jpg
4,53759,Men,Apparel,Topwear,Tshirts,Grey,Summer,2012,Casual,Puma Men Grey T-shirt,53759.jpg
5,1855,Men,Apparel,Topwear,Tshirts,Grey,Summer,2011,Casual,Inkfruit Mens Chain Reaction T-shirt,1855.jpg
6,30805,Men,Apparel,Topwear,Shirts,Green,Summer,2012,Ethnic,Fabindia Men Striped Green Shirt,30805.jpg
7,26960,Women,Apparel,Topwear,Shirts,Purple,Summer,2012,Casual,Jealous 21 Women Purple Shirt,26960.jpg
8,29114,Men,Accessories,Socks,Socks,Navy Blue,Summer,2012,Casual,Puma Men Pack of 3 Socks,29114.jpg
9,30039,Men,Accessories,Watches,Watches,Black,Winter,2016,Casual,Skagen Men Black Watch,30039.jpg


In [7]:
# Load ResNet-50 model pretrained on Imagenet without the classifying layers on top.
base_model = ResNet50(weights='imagenet', include_top=False, input_shape=(224, 224, 3))

# Add Global Max Pooling layer on top
model = keras.Sequential([
    base_model,
    GlobalMaxPooling2D()
])

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
resnet50 (Functional)        (None, 7, 7, 2048)        23587712  
_________________________________________________________________
global_max_pooling2d (Global (None, 2048)              0         
Total params: 23,587,712
Trainable params: 23,534,592
Non-trainable params: 53,120
_________________________________________________________________


In [4]:
def get_img_path(img):
  return DATASET_PATH + "images/" + img

In [5]:
def extract_embeddings(img_name, model):
  input_shape = (224, 224, 3)
  img = image.load_img(get_img_path(img_name), target_size=(input_shape[0], input_shape[1]))
  img_array = image.img_to_array(img)
  expanded_img_array = np.expand_dims(img_array, axis=0)
  preprocessed_img = preprocess_input(expanded_img_array)
  embeddings = model.predict(preprocessed_img)
  return embeddings.reshape(-1)

In [8]:
emb = extract_embeddings(df.iloc[0].image, model)
emb.shape

(2048,)

In [9]:
img_array = cv2.imread(get_img_path(df.iloc[282].image))
plt.imshow(cv2.cvtColor(img_array, cv2.COLOR_BGR2RGB))
print(img_array.shape)
print(emb)

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

(1440, 1080, 3)
[ 5.661311   2.366158   0.        ...  1.3138627  0.        10.820534 ]


In [None]:
# Create embeddings and store them into dataframe
df_sample      = df#.sample(10)
map_embeddings = df_sample['image'].apply(lambda img: extract_embeddings(img, model))
df_embs        = map_embeddings.apply(pd.Series)

In [None]:
# Serialize dataframe to pickle file
df_embs.to_pickle(DATASET_PATH + "resnet50-embeddings.pkl")

In [6]:
df_embs = pickle.load(open(DATASET_PATH + "resnet50-embeddings.pkl", "rb"))

In [7]:
# Perform PCA over the embeddings to reduce dimensionality before applying t-sne
num_feature_dimensions = 2  # Set the number of embedding dimensions
pca = PCA(n_components = num_feature_dimensions)
embs_compressed = pca.fit_transform(df_embs)
df_embs_compressed = pd.DataFrame(embs_compressed)

In [8]:
df_embs.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,5.661316,2.366165,0.0,3.25573,0.845208,2.673841,8.78087,6.280786,0.787188,1.587657,...,2.46361,0.0,1.825994,20.485268,3.902863,0.0,8.225969,1.313863,0.0,10.820526
1,5.790417,9.593648,0.0,9.43167,0.287235,0.259945,17.028748,3.037453,0.136341,0.0,...,1.67057,0.013891,10.338681,4.784722,0.18914,0.0,7.739144,0.372693,0.0,8.903802
2,0.0,3.162254,0.484691,0.172299,2.609548,3.840852,3.426139,0.253253,1.098907,1.418282,...,0.434926,14.092834,3.018744,2.134506,2.549062,0.377385,6.180348,0.63662,14.018165,11.415048
3,5.882203,1.958606,0.0,12.519882,0.0,0.0,9.383594,3.781614,0.0,1.490617,...,17.825478,0.0,3.592507,2.042305,0.0,0.0,6.656779,1.457314,1.694204,1.403327
4,0.245968,17.38343,0.493824,3.374468,2.567368,0.0,6.482537,5.862639,0.0,3.164866,...,4.464024,0.447075,1.448143,12.884138,0.0,0.397501,5.028075,1.939697,0.0,11.4854


In [9]:
df_embs_compressed.head()

Unnamed: 0,0,1
0,-106.875221,-38.913704
1,-97.877441,-42.474339
2,24.955372,105.210243
3,-83.75325,-23.995729
4,-106.195869,-32.474777


In [10]:
# Check whether there are null values in data frame...
print(df[df.season.isnull()])

       id gender masterCategory subCategory   articleType baseColour season  \
282  5402  Women       Footwear       Shoes  Sports Shoes     Purple    NaN   

     year   usage                 productDisplayName     image  
282  2011  Sports  Kalenji Ekiden 200 Wn Purple 2011  5402.jpg  


In [11]:
# ...and override if there are any,
df.at[282, 'season'] = "Summer"

In [12]:
# Application of SVM to create the axis for a given feature
X = df_embs
y = df["season"]
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
svm_clf = LinearSVC(C=1, max_iter=100000)
svm_clf.fit(X_scaled, y)

LinearSVC(C=1, max_iter=100000)

In [75]:
dir(svm_clf)

['C',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_check_n_features',
 '_estimator_type',
 '_get_param_names',
 '_get_tags',
 '_more_tags',
 '_predict_proba_lr',
 '_repr_html_',
 '_repr_html_inner',
 '_repr_mimebundle_',
 '_validate_data',
 'class_weight',
 'classes_',
 'coef_',
 'decision_function',
 'densify',
 'dual',
 'fit',
 'fit_intercept',
 'get_params',
 'intercept_',
 'intercept_scaling',
 'loss',
 'max_iter',
 'multi_class',
 'n_features_in_',
 'n_iter_',
 'penalty',
 'predict',
 'random_state',
 'score',
 'set_params',
 'sparsify',
 'tol',
 'verbose']

In [48]:
df['x'] = df_embs_compressed[0]
df['y'] = df_embs_compressed[1]

In [74]:
w = svm_clf.coef_[0]
b = svm_clf.intercept_[0]
xx = np.linspace(-15, 15)
yy = -w[0]/w[1] * xx - b/w[1]

xx.shape, yy.shape

((50,), (50,))

In [73]:
db = np.array((xx, yy))
db_pca = pca.inverse_transform(xx)
db_pca

ValueError: shapes (2048,) and (2,2048) not aligned: 2048 (dim 0) != 2 (dim 0)

In [39]:
def create_user_marker(x, y):
    """Creates and returns a user marker on a specific position"""
    user_marker = plt.plot(x, y, 'yo', markersize=10)
    user_position = np.array([x, y])
    return user_marker, user_position

In [40]:
def get_updated_user_pos(change):
    """
    Computes the index of the element that is closest to the given change value in x0.
    Returns the new user position.
    """
    index = np.where(np.isclose(neg_decision_boundary, min(neg_decision_boundary, key=lambda x:abs(x-change))))
    return neg_decision_boundary[index[0][0]], x0[index[0][0]]

In [59]:
def update_user_position(change):
    """Update the user position after the slider value has changed"""
    new_x, new_y = get_updated_user_pos(change.new)
    user_marker[0].set_data(new_x, new_y)
    new_user_pos = np.array([new_x, new_y])
    nearest_neighbour, nearest_neighbour_pos = get_nearest_neighbour(new_user_pos, df)
    annotate_nearest_neighbour(nearest_neighbour, nearest_neighbour_pos, ax, df)
    plt.title('Nearest Embedding: {} with season: {}'.format(nearest_neighbour, df.loc[df['id'] == nearest_neighbour].season.values[0]))
    fig.canvas.draw()
    fig.canvas.flush_events()

In [43]:
def calc_svm_decision_boundary(svm_clf, xmin, xmax):
    """Compute a decision boundary and ret"""
    w = svm_clf.coef_[0]
    b = svm_clf.intercept_[0]
    xx = np.linspace(xmin, xmax, 2048)
    yy = -w[0]/w[1] * xx - b/w[1]
    return yy, xx

In [44]:
def get_nearest_neighbour(user_position, df):
   nearest_neighbour = None
   nearest_neighbour_pos = None
   smallest_dist = sys.maxsize
   for row in df.itertuples():
      embedding_position = np.array([row.x, row.y])
      dist = norm(user_position - embedding_position)
      if dist < smallest_dist:
         smallest_dist = dist
         nearest_neighbour = row.id
         nearest_neighbour_pos = embedding_position
   return nearest_neighbour, nearest_neighbour_pos

In [45]:
def highlight_nearest_neighbour(id, df):
    x_nn = df.loc[df['id'] == id].x.values[0]
    y_nn = df.loc[df['id'] == id].y.values[0]
    plt.scatter(x=x_nn, y=y_nn, color='r')

In [46]:
def annotate_nearest_neighbour(nearest_neighbour, nearest_neighbour_pos, ax, df):
    if ax.artists != []:
        ax.artists[0].remove()
    arr_img = plt.imread(get_img_path(df.loc[df['id'] == nearest_neighbour].image.values[0]))
    imagebox = OffsetImage(arr_img, zoom=0.025)
    imagebox.image.axes = ax
    ab = AnnotationBbox(imagebox, nearest_neighbour_pos, xybox=(-20, 40), xycoords='data', boxcoords="offset points", arrowprops=dict(arrowstyle="->"))
    ax.add_artist(ab)

In [57]:
df.loc[df['id'] == 45097]['season'].values[0]

'Winter'

In [60]:
from ipywidgets import AppLayout, FloatSlider
from matplotlib.offsetbox import (AnnotationBbox, OffsetImage, TextArea)

plt.ioff()

fig, ax = plt.subplots(figsize=(12,7))
fig.canvas.header_visible = False
fig.canvas.layout.min_height = '400px'
sns.scatterplot(x="x", y="y",
                  hue="masterCategory", # can be replaced with `subCategory` to see more details
                  data=df,
                  legend="full",
                  alpha=0.8)

decision_boundary, x0 = calc_svm_decision_boundary(svm_clf, -150, 150)
neg_decision_boundary = np.negative(decision_boundary)
plt.plot(x0, decision_boundary, "k-", linewidth=2)
plt.plot(neg_decision_boundary, x0, "k-", linewidth=2)

rand_idx = random.choice(range(len(neg_decision_boundary)))
x = neg_decision_boundary[rand_idx]
y = x0[rand_idx]
user_marker, user_positon = create_user_marker(x, y)

nearest_neighbour, nearest_neighbour_pos = get_nearest_neighbour(user_positon, df)
annotate_nearest_neighbour(nearest_neighbour, nearest_neighbour_pos, ax, df)

plt.title('Nearest Embedding: {} with season: {}'.format(nearest_neighbour, df.loc[df['id'] == nearest_neighbour].season.values[0]))

# Create Slider to interact with the plot
slider = FloatSlider(
    orientation="horizontal",
    description="Position:",
    value=user_positon[0],
    min=min(neg_decision_boundary),
    max=max(neg_decision_boundary)
)
slider.layout.margin = '0px 30% 0px 30%'
slider.layout.width = '40%'

slider.observe(update_user_position, names='value')

AppLayout(
    center=fig.canvas,
    footer=slider,
    pane_heights=[0, 6, 1]
)

AppLayout(children=(FloatSlider(value=7.148135768279079, description='Position:', layout=Layout(grid_area='foo…