## Import libraries

In [8]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity 

from bokeh.io import show, curdoc, output_notebook, push_notebook
from bokeh.plotting import figure
from bokeh.models import ColumnDataSource, HoverTool, Select, Paragraph, TextInput
from bokeh.layouts import widgetbox, column, row
from ipywidgets import interact 

In [9]:
df = pd.read_csv('body_tsne_2.csv')
df.drop(['level_0','index'], axis = 1, inplace = True)

In [10]:
df.head(5)

Unnamed: 0,Label,brand,name,price,ingredients,Combination,Dry,Normal,Oily,X,Y
0,soap_Combination,FIRST AID BEAUTY,KP Bump Eraser Body Scrub with 10% AHA,['28'],-Pumice Buffing Beads: Exfoliate particles to ...,1,0,0,0,-113.32615,97.186554
1,soap_Combination,HERBIVORE,Coco Rose Exfoliating Body Scrub,['36'],-Virgin Coconut Oil: Provides intensive hydrat...,1,1,1,1,-93.9779,4.179508
2,soap_Combination,HERBIVORE,Coconut Milk Bath Soak,['18'],"-Coconut Milk Powder: Hydrates and soothes, le...",1,1,1,1,-184.89255,19.795685
3,soap_Combination,MOROCCANOIL,Shower Gel,['30'],-Argan Oil: Extremely rich in tocopherols (vit...,1,1,1,1,72.01765,74.65067
4,soap_Combination,CAUDALIE,Crushed Cabernet Scrub,['38'],-Crushed Grape Seeds and Brown Sugar: Exfoliat...,1,1,1,1,-20.93067,128.42899


## Visualization 

In [11]:
df.Label.unique()
# option1_option2 combinations (5*4 = 20)

# beauty supplements are not just for body - no skin type labels

array(['soap_Combination', 'soap_Dry', 'soap_Normal', 'soap_Oily',
       'moisturizers_Combination', 'moisturizers_Dry',
       'moisturizers_Normal', 'moisturizers_Oily', 'SPF_Combination',
       'SPF_Dry', 'SPF_Normal', 'SPF_Oily', 'body_Combination',
       'body_Dry', 'body_Normal', 'body_Oily'], dtype=object)

In [12]:
option_1 = ['soap', 'moisturizers', 'SPF', 'body'] #supplements
option_2 = ['Combination', 'Dry', 'Normal', 'Oily']

In [13]:
# make connection
output_notebook()

In [14]:
# make source
# scatter bokeh plot

source = ColumnDataSource(df)

plot = figure(x_axis_label = 'TSNE1', y_axis_label = 'TSNE2', 
             width = 500, height = 400, tools = 'pan, box_zoom, wheel_zoom')

plot.circle(x= 'X', y = 'Y', source = source, size = 10, color = "#FA8072", alpha = .8)
plot.background_fill_color = "#FAEBD7"
plot.background_fill_alpha = 0.1

In [15]:
# add hover tool 

hover = HoverTool(tooltips = [
    ('Item', '@name'),
    ('brand', '@brand'),
    ('Price', '$ @price')
])

plot.add_tools(hover)

In [16]:
def update(op1 = option_1[0], op2 = option_2[0]):
    a_b = op1 + '_' + op2
    new_data = {
        'X': df[df['Label'] == a_b]['X'],
        'Y': df[df['Label'] == a_b]['Y'],
        'name': df[df['Label'] == a_b]['name'],
        'brand': df[df['Label'] == a_b]['brand'],
        'price': df[df['Label'] == a_b]['price']
    }
    source.data = new_data
    push_notebook()

In [17]:
output_notebook()

interact(update, op1 = option_1, op2 = option_2)
show(plot, notebook_handle = True)

interactive(children=(Dropdown(description='op1', options=('soap', 'moisturizers', 'SPF', 'body'), value='soap…

In [11]:
# a plane for type of product vs price range

## Cosine Similarity

In [18]:
df_test = df[df.Label == 'soap_Combination'].reset_index()
df_test['dist'] = 0.0

In [19]:
myItem = df_test[df_test.name.str.contains('Coco Rose Exfoliating Body Scrub')]
myItem

Unnamed: 0,index,Label,brand,name,price,ingredients,Combination,Dry,Normal,Oily,X,Y,dist
1,1,soap_Combination,HERBIVORE,Coco Rose Exfoliating Body Scrub,['36'],-Virgin Coconut Oil: Provides intensive hydrat...,1,1,1,1,-93.9779,4.179508,0.0


In [21]:
p1 = np.array([myItem.X.values, myItem.Y.values]).reshape(1, -1)
p1

array([[-93.9779   ,   4.1795077]])

In [22]:
# cosine similarities with other items
for i in range(len(df_test)):
    p2 = np.array([df_test['X'][i], df_test['Y'][i]]).reshape(-1, 1)
    df_test.dist[i] = (p1 * p2).sum() / (np.sqrt(np.sum(p1))*np.sqrt(np.sum(p2)))

  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [23]:
df_test = df_test.sort_values('dist')
df_test[['name', 'brand', 'dist']].head(5)

Unnamed: 0,name,brand,dist
0,KP Bump Eraser Body Scrub with 10% AHA,FIRST AID BEAUTY,
1,Coco Rose Exfoliating Body Scrub,HERBIVORE,
2,Coconut Milk Bath Soak,HERBIVORE,
3,Shower Gel,MOROCCANOIL,
4,Crushed Cabernet Scrub,CAUDALIE,
