## Import libraries

In [12]:
import pandas as pd
import numpy as np 
from sklearn.manifold import TSNE

## Creating options (for dropdown menu later)

In [13]:
# Load the data 
#cosm_2 = pd.read_csv('cosmetic_p.csv')
df = pd.read_csv('body-care-1.csv')

In [16]:
option_1 = df.Label.unique().tolist()
print(option_1)


['bath-and-body-soap', 'body-moisturizers', 'sun-lotion', 'body-care', 'beauty-supplements-bath-body']


## Test block

In [4]:
option_1 = cosm_2.Label.unique().tolist()
# look through Label column
# unique values in list 

option_2 = cosm_2.columns[6:].tolist()
# all possible combinations for the option choices 

print(option_1)
print(option_2)

['Moisturizer', 'Cleanser', 'Treatment', 'Mask', 'Eye', 'SPF']
['Combination', 'Dry', 'Full', 'Light', 'Matte', 'Medium', 'Natural', 'Normal', 'Oily', 'Radiant', 'Sensitive']


In [5]:
# Example: Moisturizers for Dry skin

ex_opt1 = cosm_2.Label.unique().tolist()
ex_opt2 = ['Combination','Dry','Normal','Oily','Sensitive']

# Filter data by given options 
df = cosm_2[cosm_2['Label'] == 'Moisturizer'][cosm_2['Dry'] == 1]
df = df.reset_index() 

  import sys


## 1. Tokenizing ingredients list
## 2. Apply dimensionality reduction to reduce features

In [6]:
# tokenizing the list of ingredients in Ingredients column 
# after splitting into tokens, make a binary bag of words
# create dictionary with tokens, ingredient_idx 

def my_recommender(op_1, op_2): 
    df = cosm_2[cosm_2['Label'] == op_1][cosm_2[op_2] == 1]
    df = df.reset_index()
    
    # embedding each ingredient
    ingredient_idx = {}
    corpus = []
    idx = 0 
    
    for i in range(len(df)):
        ingred = df['ingredients'][i]
        ingred = ingred.lower()
        tokens = ingred.split(', ')
        corpus.append(tokens)
        
        # tokenized words put into corpus 
        
        for ingredient in tokens:
            if ingredient not in ingredient_idx:
                ingredient_idx[ingredient] = idx
                idx += 1
                
                # words put into dictionary 
                # checks duplication 
                # index increases by 1 
    
    # DTM (cosmetic-ingredient)
    # cosmetic product corresponds to a document
    # chemical composition corresponds to term 
    
    # number of items, tokens (M, N)
    M = len(df)
    N = len(ingredient_idx)
    
    # initialize matrix of zeros 
    A = np.zeros(shape = (M,N))
    
    # define one-hot encoder func
    # 1 if ingredient is in cosmetic, otherwise 0 
    def oh_enc(tokens):
        x = np.zeros(N)
        for t in tokens: 
            idx = ingredient_idx[t]
            # '1' at corresponding indices
            x[idx] = 1 
        return x 
    
    # apply oh_enc to tokens in corpus 
    # set values at each row of matrix 
    
    i = 0 
    for tokens in corpus:
        A[i, :]= oh_enc(tokens)
        i += 1
    
    # dimension reduction w t-SNE
    model = TSNE(n_components =2, learning_rate = 200)
    tsne_features = model.fit_transform(A) 
    
    # make X, Y columns
    df['X'] = tsne_features[:,0]
    df['Y'] = tsne_features[:,1]
    
    return df

In [9]:
# Create df for all combos 
df_all = pd.DataFrame() 
for op_1 in option_1:
    for op_2 in option_2:
        temp = my_recommender(op_1,op_2)
        temp['Label'] = op_1 + '_' + op_2
        df_all = pd.concat([df_all, temp])

  
  
  
  
  
  


ValueError: Found array with 1 sample(s) (shape=(1, 15)) while a minimum of 2 is required.

## Save file

In [10]:
df_all.to_csv('cosmetic_tsne.csv', encoding = 'utf-8-sig', index = False)