In [1]:
%matplotlib widget

import os
import pickle
import sys

import cv2
import matplotlib.image as mpimg
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
from numpy.linalg import norm
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.neighbors import RadiusNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC, LinearSVC
from tensorflow import keras
from tensorflow.keras.applications.resnet50 import ResNet50, preprocess_input
from tensorflow.keras.layers import GlobalMaxPooling2D
from tensorflow.keras.preprocessing import image

In [3]:
# Root folder of the fashion dataset. The original local path stays the default,
# but it can be overridden with the FASHION_DATASET_PATH environment variable so
# the notebook can run on another machine without editing this cell.
# os.path.join(..., "") guarantees a trailing separator, which the plain string
# concatenations elsewhere in the notebook (DATASET_PATH + "styles.csv") rely on.
DATASET_PATH = os.path.join(
    os.environ.get(
        "FASHION_DATASET_PATH",
        "/Users/jeremy/Google Drive/datasets/fashion-dataset/",
    ),
    "",
)
print(os.listdir(DATASET_PATH))

['embeddings.tsv', '.DS_Store', 'images.csv', 'images', 'styles', 'styles.csv', '.ipynb_checkpoints', 'embeddings.csv', 'resnet50-embeddings.pkl']


In [4]:
# Load the first 5000 rows of the product catalogue.
# `on_bad_lines="warn"` skips malformed CSV lines while still reporting them; it
# replaces `error_bad_lines=False`, which was deprecated in pandas 1.3 and
# removed in pandas 2.0.
df = pd.read_csv(DATASET_PATH + "styles.csv", nrows=5000, on_bad_lines="warn")
# Image file name is "<id>.jpg"; built with a vectorised string concat instead
# of a Python-level apply over every row.
df['image'] = df['id'].astype(str) + ".jpg"
df = df.reset_index(drop=True)
df.head(10)

Unnamed: 0,id,gender,masterCategory,subCategory,articleType,baseColour,season,year,usage,productDisplayName,image
0,15970,Men,Apparel,Topwear,Shirts,Navy Blue,Fall,2011,Casual,Turtle Check Men Navy Blue Shirt,15970.jpg
1,39386,Men,Apparel,Bottomwear,Jeans,Blue,Summer,2012,Casual,Peter England Men Party Blue Jeans,39386.jpg
2,59263,Women,Accessories,Watches,Watches,Silver,Winter,2016,Casual,Titan Women Silver Watch,59263.jpg
3,21379,Men,Apparel,Bottomwear,Track Pants,Black,Fall,2011,Casual,Manchester United Men Solid Black Track Pants,21379.jpg
4,53759,Men,Apparel,Topwear,Tshirts,Grey,Summer,2012,Casual,Puma Men Grey T-shirt,53759.jpg
5,1855,Men,Apparel,Topwear,Tshirts,Grey,Summer,2011,Casual,Inkfruit Mens Chain Reaction T-shirt,1855.jpg
6,30805,Men,Apparel,Topwear,Shirts,Green,Summer,2012,Ethnic,Fabindia Men Striped Green Shirt,30805.jpg
7,26960,Women,Apparel,Topwear,Shirts,Purple,Summer,2012,Casual,Jealous 21 Women Purple Shirt,26960.jpg
8,29114,Men,Accessories,Socks,Socks,Navy Blue,Summer,2012,Casual,Puma Men Pack of 3 Socks,29114.jpg
9,30039,Men,Accessories,Watches,Watches,Black,Winter,2016,Casual,Skagen Men Black Watch,30039.jpg


In [4]:
# Feature extractor: ResNet-50 with ImageNet weights and without its
# classification head (include_top=False), followed by a global max pooling
# layer that reduces each feature map to a single value.
model = keras.Sequential()

base_model = ResNet50(include_top=False, weights='imagenet', input_shape=(224, 224, 3))
model.add(base_model)
model.add(GlobalMaxPooling2D())

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
resnet50 (Functional)        (None, 7, 7, 2048)        23587712  
_________________________________________________________________
global_max_pooling2d (Global (None, 2048)              0         
Total params: 23,587,712
Trainable params: 23,534,592
Non-trainable params: 53,120
_________________________________________________________________


In [5]:
def get_img_path(img):
  """Return the path of image file `img` inside the dataset's images/ folder.

  The result is plain string concatenation on top of DATASET_PATH, so `img`
  is expected to be a file name such as "15970.jpg".
  """
  images_dir = DATASET_PATH + "images/"
  return images_dir + img

In [6]:
def extract_embeddings(img_name, model):
  """Return the flattened embedding `model` predicts for a single image.

  The image is located with get_img_path, resized to 224x224 while loading,
  wrapped into a batch of one, run through ResNet-50's preprocess_input and
  passed to model.predict. The prediction is flattened to 1-D.

  Args:
    img_name: image file name, resolved through get_img_path.
    model: object exposing predict(batch).

  Returns:
    1-D array holding the prediction for the image.
  """
  height, width, _channels = (224, 224, 3)
  pil_img = image.load_img(get_img_path(img_name), target_size=(height, width))
  # Leading axis of size 1: predict() works on batches.
  batch = np.expand_dims(image.img_to_array(pil_img), axis=0)
  prediction = model.predict(preprocess_input(batch))
  return prediction.reshape(-1)

In [7]:
# Sanity check: embed the first catalogue image and inspect the vector's shape.
emb = extract_embeddings(df["image"].iloc[0], model)
emb.shape

(2048,)

In [84]:
# Preview the catalogue image at positional row 282 and print its pixel shape.
sample_path = get_img_path(df.iloc[282].image)
img_array = cv2.imread(sample_path)
# cv2.imread does not raise for a missing or unreadable file; it returns None,
# which would otherwise surface as an opaque error inside cvtColor.
if img_array is None:
    raise FileNotFoundError("Could not read image: " + sample_path)
# OpenCV loads pixels as BGR; matplotlib expects RGB.
plt.imshow(cv2.cvtColor(img_array, cv2.COLOR_BGR2RGB))
print(img_array.shape)
# NOTE: `emb` is whatever an earlier cell assigned (the embedding of row 0), not
# the embedding of the image shown here; that cell has to be run first.
print(emb)

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

(1440, 1080, 3)


NameError: name 'emb' is not defined

In [None]:
# Create embeddings and store them in a dataframe: one row per image, one
# column per embedding dimension, indexed like the source rows.
df_sample      = df  # swap in df.sample(n) for a quick trial run
map_embeddings = df_sample['image'].apply(lambda img: extract_embeddings(img, model))
# Stack the per-image vectors into a single 2-D array in one step.
# `Series.apply(pd.Series)` builds a Series object for every row and is far
# slower for thousands of images.
df_embs        = pd.DataFrame(np.stack(map_embeddings.to_numpy()), index=map_embeddings.index)

In [None]:
# Cache the embeddings dataframe on disk as a pickle so the extraction step
# does not have to be repeated.
df_embs.to_pickle(path=DATASET_PATH + "resnet50-embeddings.pkl")

In [7]:
# Reload the cached embeddings. The context manager closes the file handle,
# which the bare pickle.load(open(...)) form left open.
# NOTE: unpickling can execute arbitrary code; only load files written by this notebook.
with open(DATASET_PATH + "resnet50-embeddings.pkl", "rb") as embeddings_file:
    df_embs = pickle.load(embeddings_file)

In [8]:
# Perform PCA over the embeddings to reduce dimensionality before applying t-SNE.
num_feature_dimensions = 100  # Number of embedding dimensions kept after PCA
# Fixed seed: depending on the input size PCA may pick a randomised SVD solver,
# which would make the compressed embeddings differ from run to run.
pca = PCA(n_components=num_feature_dimensions, random_state=42)
# fit_transform fits the projection and applies it to the same data in one call.
embs_compressed = pca.fit_transform(df_embs)
df_embs_compressed = pd.DataFrame(embs_compressed)

In [9]:
# Peek at the first rows of the raw embeddings loaded from the pickle.
df_embs.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,5.661316,2.366165,0.0,3.25573,0.845208,2.673841,8.78087,6.280786,0.787188,1.587657,...,2.46361,0.0,1.825994,20.485268,3.902863,0.0,8.225969,1.313863,0.0,10.820526
1,5.790417,9.593648,0.0,9.43167,0.287235,0.259945,17.028748,3.037453,0.136341,0.0,...,1.67057,0.013891,10.338681,4.784722,0.18914,0.0,7.739144,0.372693,0.0,8.903802
2,0.0,3.162254,0.484691,0.172299,2.609548,3.840852,3.426139,0.253253,1.098907,1.418282,...,0.434926,14.092834,3.018744,2.134506,2.549062,0.377385,6.180348,0.63662,14.018165,11.415048
3,5.882203,1.958606,0.0,12.519882,0.0,0.0,9.383594,3.781614,0.0,1.490617,...,17.825478,0.0,3.592507,2.042305,0.0,0.0,6.656779,1.457314,1.694204,1.403327
4,0.245968,17.38343,0.493824,3.374468,2.567368,0.0,6.482537,5.862639,0.0,3.164866,...,4.464024,0.447075,1.448143,12.884138,0.0,0.397501,5.028075,1.939697,0.0,11.4854


In [10]:
# Peek at the first rows of the PCA-reduced embeddings.
df_embs_compressed.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,-106.875,-38.914272,0.338536,19.187342,14.934504,-24.085049,-7.463311,3.361398,21.677139,-9.196507,...,-3.791938,-3.070341,-0.637846,2.765099,-7.13383,-4.674678,-9.167339,2.57459,-0.271201,3.738719
1,-97.877342,-42.474228,-8.299099,-21.014223,30.329556,-65.437759,18.46756,12.433216,29.025728,13.179803,...,-0.34027,-3.068476,-0.945578,-4.649183,2.285379,-6.084108,-6.524863,-8.81234,-7.726312,-4.245975
2,24.955225,105.210426,-22.968132,69.64576,26.989901,-7.558082,29.480919,-29.77622,9.603487,-11.335951,...,-1.430069,-20.650574,-5.309267,1.480228,10.95741,-0.699188,5.457666,6.92782,-1.244179,-10.919445
3,-83.753204,-23.995764,-13.9615,-31.153566,28.798128,-46.638245,9.314443,17.460033,31.332096,24.376125,...,-8.659169,2.037807,3.822915,-10.76472,-3.695076,-6.654882,-8.889603,-7.959279,-13.363052,2.760697
4,-106.195808,-32.47509,-13.414531,-0.512059,15.809568,-40.974976,8.316986,5.49966,-25.244316,-11.114714,...,-6.731833,-2.915672,-5.815763,14.513758,2.124103,7.826849,0.79263,-1.91707,0.516684,0.058416


In [11]:
# Project the PCA-reduced embeddings to 2-D with t-SNE and attach the
# coordinates to the catalogue dataframe as columns `x` and `y`.
# random_state pins the otherwise random layout, so the coordinates that later
# cells plot and look up stay the same across runs.
# NOTE(review): newer scikit-learn releases rename `n_iter` to `max_iter` —
# confirm against the installed version.
tsne = TSNE(n_components=2, verbose=0, perplexity=40, n_iter=300, random_state=42)
tsne_results = tsne.fit_transform(df_embs_compressed)
df['x'] = tsne_results[:, 0]
df['y'] = tsne_results[:, 1]

In [18]:
# Rows whose `season` is missing. Left as the cell's last expression so the
# notebook renders the frame as a table instead of print()'s plain-text dump.
df[df['season'].isnull()]

Empty DataFrame
Columns: [id, gender, masterCategory, subCategory, articleType, baseColour, season, year, usage, productDisplayName, image, x, y]
Index: []


In [17]:
# Set `season` for the row labelled 282 to "Summer".
# NOTE(review): presumably this fills a missing value so the null check and the
# classifier fit see no NaN labels — confirm against the raw styles.csv.
df.at[282, 'season'] = "Summer"

In [112]:
# Fit a linear SVM that predicts `season` from the 2-D t-SNE coordinates.
X = df[["x", "y"]].to_numpy()
y = df["season"].to_numpy()

# SVC is imported in the notebook's top import cell; it was previously used
# here without being imported.
# A finite C gives a soft margin. The former C=float("inf") asks for a hard
# margin, which the solver can only satisfy when the classes are linearly
# separable in (x, y). NOTE(review): tune C if the fit quality matters.
svm_clf = SVC(kernel="linear", C=1.0)
svm_clf.fit(X, y)

In [1]:
# Show the first row of the fitted linear SVM's coefficient matrix.
# Requires the cell that creates and fits `svm_clf` to have been run in this kernel.
svm_clf.coef_[0]

NameError: name 'svm_clf' is not defined

In [13]:
def create_axis():
    """Draw a thick black horizontal line at y = 0 spanning x = -15 to x = 15.

    The line is plotted through pyplot, i.e. onto the current axes.
    """
    xs = np.linspace(-15, 15)
    ys = np.zeros_like(xs)  # same length as xs, all zeros
    plt.plot(xs, ys, 'k', linewidth=2)

In [14]:
def create_user_marker(x=0):
    """Plot a yellow circle marker at (x, 0) on the current axes.

    Args:
        x: horizontal position of the marker (default 0).

    Returns:
        (marker, position): the value returned by plt.plot for the marker and
        the marker position as np.array([x, 0]).
    """
    position = np.array([x, 0])
    marker = plt.plot(x, 0, 'yo', markersize=10)
    return marker, position

In [15]:
def get_nearest_neighbour(user_position, df):
    """Find the row of `df` whose (x, y) point is closest to `user_position`.

    Distances are Euclidean and computed for all rows at once instead of in a
    Python-level loop, because this runs on every slider movement.

    Args:
        user_position: length-2 array-like holding the query point (x, y).
        df: DataFrame with `id`, `x` and `y` columns.

    Returns:
        (id, position): the `id` of the closest row and its position as
        np.array([x, y]). Returns (None, None) when `df` has no rows or no row
        has a finite distance. On ties the first row wins.
    """
    if len(df) == 0:
        return None, None

    positions = df[['x', 'y']].to_numpy(dtype=float)
    dists = norm(positions - np.asarray(user_position, dtype=float), axis=1)
    # Rows with NaN coordinates can never be the nearest neighbour.
    dists = np.where(np.isnan(dists), np.inf, dists)

    best = int(np.argmin(dists))  # argmin returns the first minimum
    if not np.isfinite(dists[best]):
        return None, None

    nearest_neighbour = df['id'].iat[best]
    if isinstance(nearest_neighbour, np.generic):
        # Hand back a plain Python scalar, as row iteration used to.
        nearest_neighbour = nearest_neighbour.item()
    return nearest_neighbour, positions[best].copy()

In [17]:
def highlight_nearest_neighbour(id, df):
    """Draw a red scatter point at the (x, y) position of the row with this id.

    Args:
        id: value looked up in the `id` column of `df`.
        df: DataFrame with `id`, `x` and `y` columns.

    If several rows match, the first one is used; if none match, indexing the
    empty selection raises IndexError.
    """
    matching_rows = df.loc[df['id'] == id]
    x_nn, y_nn = matching_rows.x.values[0], matching_rows.y.values[0]
    plt.scatter(x=x_nn, y=y_nn, color='r')

In [16]:
def update_user_position(change):
    """Slider callback: move the user marker and retitle the plot.

    Reads the notebook-level globals `user_marker`, `df` and `fig`.

    NOTE: a later cell in this notebook defines `update_user_position` again
    (adding the thumbnail annotation); that later definition is the one bound
    when the slider observer is registered.

    Args:
        change: change notification whose `.new` attribute is the new slider value.
    """
    # Line2D.set_data expects sequences; recent Matplotlib rejects bare scalars.
    user_marker[0].set_data([change.new], [0])
    new_user_pos = np.array([change.new, 0])
    nearest_neighbour, nearest_neighbour_pos = get_nearest_neighbour(new_user_pos, df)
    plt.title('Nearest Embedding: {}'.format(nearest_neighbour))
    fig.canvas.draw()
    fig.canvas.flush_events()

In [18]:
def annotate_nearest_neighbour(nearest_neighbour, nearest_neighbour_pos, ax, df):
    """Attach a thumbnail of the nearest product image to its point on `ax`.

    The first artist already present in `ax.artists` (the previous thumbnail,
    when this function is the only thing adding artists) is removed before the
    new one is added.

    Args:
        nearest_neighbour: product id looked up in the `id` column of `df`.
        nearest_neighbour_pos: (x, y) data coordinates the arrow points to.
        ax: matplotlib axes to annotate.
        df: DataFrame with `id` and `image` columns.
    """
    # Check the length rather than comparing with `[]`: on Matplotlib >= 3.5
    # `ax.artists` is a sequence view, not a list, so `!= []` is always true and
    # indexing an empty view raises IndexError.
    if len(ax.artists) > 0:
        ax.artists[0].remove()
    image_name = df.loc[df['id'] == nearest_neighbour].image.values[0]
    arr_img = plt.imread(get_img_path(image_name))
    imagebox = OffsetImage(arr_img, zoom=0.02)
    imagebox.image.axes = ax
    ab = AnnotationBbox(
        imagebox,
        nearest_neighbour_pos,
        xybox=(-20, 40),
        xycoords='data',
        boxcoords="offset points",
        arrowprops=dict(arrowstyle="->"),
    )
    ax.add_artist(ab)

In [20]:
# Print the image file name stored for product id 25111.
print(df.loc[df['id'] == 25111].image.values[0])

25111.jpg


In [21]:
# Show the full row (metadata plus x/y coordinates) for product id 48114.
df.loc[df['id'] == 48114]

Unnamed: 0,id,gender,masterCategory,subCategory,articleType,baseColour,season,year,usage,productDisplayName,image,x,y
2892,48114,Women,Accessories,Belts,Belts,Multi,Summer,2012,Casual,Fossil Women Multi Coloured Quilted Squares Belt,48114.jpg,11.629772,3.227715


In [22]:
from ipywidgets import AppLayout, FloatSlider
from matplotlib.offsetbox import (AnnotationBbox, OffsetImage, TextArea)

# Turn pyplot's interactive mode off before building the figure.
plt.ioff()

# Horizontal slider that drives the user marker along the x axis (-15 .. 15).
slider = FloatSlider(
    value=0.0,
    min=-15.0,
    max=15.0,
    description="Position:",
    orientation="horizontal",
)
slider.layout.width = '40%'
slider.layout.margin = '0px 30% 0px 30%'

# Scatter plot of the 2-D embeddings, coloured by product category.
fig, ax = plt.subplots(figsize=(12, 7))
fig.canvas.layout.min_height = '400px'
fig.canvas.header_visible = False
sns.scatterplot(
    data=df,
    x="x",
    y="y",
    hue="masterCategory",  # can be replaced with `subCategory` to see more details
    alpha=0.8,
    legend="full",
)

# Initial state: axis line, user marker at x = 0 and its nearest product.
# `user_positon` (sic) is the name this cell has always defined; kept as is.
create_axis()
user_marker, user_positon = create_user_marker()
nearest_neighbour, nearest_neighbour_pos = get_nearest_neighbour(user_positon, df)
annotate_nearest_neighbour(nearest_neighbour, nearest_neighbour_pos, ax, df)

plt.title('Nearest Embedding: {}'.format(nearest_neighbour))

def update_user_position(change):
    """Slider callback: move the user marker and refresh the nearest product.

    Reads the notebook-level globals `user_marker`, `df`, `ax` and `fig`. It
    moves the marker to the new slider value, looks up the nearest embedding,
    swaps the thumbnail annotation, updates the title and redraws the canvas.

    Args:
        change: change notification whose `.new` attribute is the new slider value.
    """
    # Line2D.set_data expects sequences; recent Matplotlib rejects bare scalars.
    user_marker[0].set_data([change.new], [0])
    new_user_pos = np.array([change.new, 0])
    nearest_neighbour, nearest_neighbour_pos = get_nearest_neighbour(new_user_pos, df)
    annotate_nearest_neighbour(nearest_neighbour, nearest_neighbour_pos, ax, df)
    plt.title('Nearest Embedding: {}'.format(nearest_neighbour))
    fig.canvas.draw()
    fig.canvas.flush_events()

# Run the callback whenever the slider's `value` changes.
slider.observe(update_user_position, names='value')

# Figure canvas in the centre pane, slider in the footer. The cell's last
# expression is the layout, so the notebook displays it.
AppLayout(
    footer=slider,
    center=fig.canvas,
    pane_heights=[0, 6, 1],
)

AppLayout(children=(FloatSlider(value=0.0, description='Position:', layout=Layout(grid_area='footer', margin='…