# Sephora Machine Learning Recommendation System
### Authors: Isabella Gonzales, Carine Wong, Libby Amir, Hajera Laique

## Code Appendix:

In [None]:
from sklearn.neural_network import MLPClassifier
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder

# Colab-only setup: mount Google Drive so the CSV files under /content/drive can be read below.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# importing reviews
# The review export is split across four CSV files that live in the same Drive folder.
reviews_dir = '/content/drive/My Drive/ECS111 Term Project'
review_files = [
    'reviews_0-250.csv',
    'reviews_250-500.csv',
    'reviews_500-750.csv',
    'reviews_750-1250.csv',
]

# NOTE(review): the original reads emitted a warning for the first and last file that looks
# like pandas' mixed-dtype warning -- confirm which column triggers it. Reading author_id as
# text and parsing each file in one pass (low_memory=False) keeps the same id from appearing
# as an int in one chunk/file and as a string in another, which would split that author's
# reviews into two groups in the per-author counts computed later.
df_1, df_2, df_3, df_4 = [
    pd.read_csv(f'{reviews_dir}/{file_name}', dtype={'author_id': str}, low_memory=False)
    for file_name in review_files
]

# ignore index for new cohesive indices in joined df
reviews_df = pd.concat([df_1, df_2, df_3, df_4], ignore_index=True)

  df_1 = pd.read_csv('/content/drive/My Drive/ECS111 Term Project/reviews_0-250.csv')
  df_4 = pd.read_csv('/content/drive/My Drive/ECS111 Term Project/reviews_750-1250.csv')


In [None]:
# Count the reviews written by every author, most prolific authors first.
# (sort_values returns a new frame, so its result has to be kept; the ordering is only
# cosmetic and does not affect the cutoff or the filter below.)
author_stats = (
    reviews_df.groupby(['author_id'])['author_id']
    .count()
    .reset_index(name='counts')
    .sort_values('counts', ascending=False)
)

# Cutoff value: the 95th percentile of reviews-per-author
cutoff = author_stats['counts'].quantile(0.95)

# Keep authors whose review count is strictly above the cutoff
filtered_authors = author_stats.loc[author_stats['counts'] > cutoff]

# Collect the surviving author ids in a real set. (.squeeze() would collapse a single
# surviving author to a bare scalar, which Series.isin rejects.)
authors = set(filtered_authors['author_id'])

# Keep only the reviews written by those authors
reviews_df = reviews_df.loc[reviews_df['author_id'].isin(authors)]

In [None]:
# Load the product catalogue, then keep only products with at least 100 reviews
# whose primary category is Skincare or Makeup.
df_p = pd.read_csv('/content/drive/My Drive/ECS111 Term Project/product_info.csv')
has_enough_reviews = df_p['reviews'] >= 100
is_skincare_or_makeup = df_p['primary_category'].isin(['Skincare', 'Makeup'])
df_p = df_p[has_enough_reviews & is_skincare_or_makeup]

In [None]:
# Per-column tally of missing values in the filtered product table
num_missing = df_p.isnull().sum(axis=0)
num_missing

product_id               0
product_name             0
brand_id                 0
brand_name               0
loves_count              0
rating                   0
reviews                  0
size                   356
variation_type         262
variation_value        314
variation_desc        2004
ingredients            165
price_usd                0
value_price_usd       2812
sale_price_usd        2786
limited_edition          0
new                      0
online_only              0
out_of_stock             0
sephora_exclusive        0
highlights             616
primary_category         0
secondary_category       0
tertiary_category      188
child_count              0
child_max_price       1529
child_min_price       1529
dtype: int64

In [None]:
# Narrow the product table down to the columns the recommender works with
product_columns = ['product_id', 'product_name', 'ingredients', 'highlights', 'secondary_category', 'brand_name']
products = df_p.reindex(columns=product_columns)
# peek at the first rows, then report (rows, columns)
products.head(5)
products.shape

(2858, 6)

In [None]:
# Products without highlights or without a secondary category cannot be compared or
# category-filtered later on, so drop any row missing either value.
products = products.dropna(subset=['highlights', 'secondary_category'])
products.shape

(2242, 6)

In [None]:
# Renumber the rows 0..n-1 now that the rows with missing values are gone
products = products.reset_index(drop=True)
# Lookup table: product name -> row number in `products`
name_labels = pd.Index(products['product_name'])
indices = pd.Series(products.index, index=name_labels)
# display the lookup table (its length is the number of remaining products)
indices

product_name
Rose Lip Conditioner                                                            0
GENIUS Sleeping Collagen Moisturizer                                            1
GENIUS Liquid Collagen Serum                                                    2
GENIUS Liquid Collagen Lip Treatment                                            3
SUBLIME DEFENSE Ultra Lightweight UV Defense Fluid SPF 50                       4
                                                                             ... 
The Curler Lengthening and Curling Mascara                                   2237
NU LIP & CHEEK BALMY TINT with Hyaluronic Acid                               2238
NU GLOW IN BALM Face Priming Moisturizer with Shea Butter                    2239
NU BLOTTING LOTION Pore Minimizer & Mattifying Primer with Salicylic Acid    2240
NU TONE CORRECTOR Color Corrector with Vitamin E                             2241
Length: 2242, dtype: int64

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# initialize the vectorizer for TF-IDF (default settings); it is fitted on the
# products' highlights text in the cell below
vectorizer = TfidfVectorizer()

In [None]:
# One highlights string per product, in row order
texts = products['highlights'].to_numpy()

# Learn the vocabulary from the highlights and weight every product's terms in one step
tfidf_matrix = vectorizer.fit_transform(texts)

# Matrix dimensions: one row per product, one column per vocabulary term;
# each stored entry is the TF-IDF weight of that term for that product
tfidf_matrix.shape

(2242, 141)

In [None]:
from sklearn.metrics.pairwise import linear_kernel


# Pairwise product similarity. TfidfVectorizer L2-normalises every row by default, so the
# plain dot product computed by linear_kernel is the cosine similarity. With a single
# argument the matrix is compared against itself.
cosine_similarity = linear_kernel(tfidf_matrix)

### First Iteration:

In [None]:
# function that takes name of one product and recommends the most similar products
def get_recommendations(product_name, cosine_sim=cosine_similarity, top_n=10):
    """Recommend the products whose highlights are most similar to one product.

    Parameters
    ----------
    product_name : str
        Exact product name as it appears in the `indices` lookup table.
    cosine_sim : 2-D array, optional
        Pairwise similarity matrix whose rows/columns follow the row order of
        `products`. Defaults to the highlights-based `cosine_similarity`.
    top_n : int, optional
        Maximum number of recommendations to return (default 10).

    Returns
    -------
    pandas.Series
        Names of the recommended products, most similar first.

    Raises
    ------
    KeyError
        If `product_name` is not present in `indices`.
    """
    # get index based on product name
    idx = indices[product_name]
    # A name that occurs more than once yields a Series of row numbers instead of a
    # single one; use the first occurrence in that case.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]

    # similarity of the query product to every product (honouring the matrix passed in)
    scores = cosine_sim[idx]

    # Exclude the query by its row number instead of assuming it sorts first: another
    # product with identical highlights ties at the top score and could otherwise
    # leave the query itself inside the recommendations.
    candidates = [i for i in range(len(scores)) if i != idx]

    # most similar first; the sort is stable, so ties keep catalogue order
    candidates.sort(key=lambda i: scores[i], reverse=True)

    # return the names of the most similar products (in order of similarity)
    return products['product_name'].iloc[candidates[:top_n]]

In [None]:
# The lookup is an exact string match, so the name must be spelled exactly as in the catalogue
get_recommendations(
    product_name="Lip Sleeping Mask Intense Hydration with Vitamin C",
    cosine_sim=cosine_similarity,
)

297          Clinique iD Custom-Blend Hydrator Collection
1230                   Hydro Grip Hydrating Makeup Primer
2008              Hyaluronic Acid 2% + B5 Hydrating Serum
2206            Mini Superberry Hydrate + Glow Dream Mask
779                       Vanish Flash Highlighting Stick
840               Mini Limitless Lash Lengthening Mascara
1014                      Tinted Face Oil Comfy Skin Tint
1244               Sunshine Vitamin C + Squalane Face Oil
2205    Superberry Hydrate + Glow Dream Night Mask wit...
1716       Synchro Skin Self-Refreshing Foundation SPF 30
Name: product_name, dtype: object

### Second Iteration:

In [None]:
# function that takes name of one product and recommends similar products from the same category
def get_recommendations2(name, cosine_sim=cosine_similarity, top_n=10):
    """Recommend similar products restricted to the query's secondary category.

    Parameters
    ----------
    name : str
        Exact product name as stored in `products['product_name']`.
    cosine_sim : 2-D array, optional
        Pairwise similarity matrix whose rows/columns follow the row order of
        `products`. Defaults to the highlights-based `cosine_similarity`.
    top_n : int, optional
        Maximum number of recommendations to return (default 10). Fewer are
        returned when the category holds fewer other products.

    Returns
    -------
    pandas.Series
        Names of the recommended products, most similar first.

    Raises
    ------
    IndexError
        If no product is called `name`.
    """
    name_mask = products['product_name'] == name

    # category of the (first) product with this name; .values[0] raises IndexError
    # when the name is unknown
    category = products.loc[name_mask, 'secondary_category'].values[0]

    # row numbers of every product in the same category as the query
    category_mask = products['secondary_category'] == category
    same_cat_indices = products.index[category_mask]

    # row number of the query itself; if the name repeats inside the category the first
    # occurrence is used rather than a Series of several row numbers
    idx = products.index[name_mask & category_mask][0]

    # similarity of the query product to every product (honouring the matrix passed in)
    scores = cosine_sim[idx]

    # Same-category candidates, with the query removed by row number instead of assuming
    # it sorts first (a product with identical highlights ties at the top score).
    candidates = [i for i in same_cat_indices if i != idx]

    # most similar first; the sort is stable, so ties keep catalogue order
    candidates.sort(key=lambda i: scores[i], reverse=True)

    # return the names of the most similar same-category products (in order of similarity)
    return products['product_name'].iloc[candidates[:top_n]]

In [None]:
# Same query as before, so the category-restricted results can be compared with the first iteration
get_recommendations2(
    name="Lip Sleeping Mask Intense Hydration with Vitamin C",
    cosine_sim=cosine_similarity,
)

1101                                       Lip Glowy Balm
1956                                   The Kissu Lip Mask
160                         Squalane+ Rose Vegan Lip Balm
616              Sugar Recovery Lip Mask Advanced Therapy
617              Sugar Mint Rush Freshening Lip Treatment
1394                  Pout Preserve Peptide Lip Treatment
1105                                   Lip Treatment Balm
607                    Sugar Lip Balm Hydrating Treatment
596     Sugar Advanced Lip Balm Intense Hydration Trea...
1667                               Clean Lip Balm & Scrub
Name: product_name, dtype: object

### Third Iteration:

In [None]:
# function that takes the name and brand of one product and recommends similar products
def get_recommendations3(name, brand_name, cosine_sim=cosine_similarity, top_n=10):
    """Recommend similar same-category products, identifying the query by name and brand.

    Parameters
    ----------
    name : str
        Exact product name as stored in `products['product_name']`.
    brand_name : str
        Exact brand as stored in `products['brand_name']`.
    cosine_sim : 2-D array, optional
        Pairwise similarity matrix whose rows/columns follow the row order of
        `products`. Defaults to the highlights-based `cosine_similarity`.
    top_n : int, optional
        Maximum number of recommendations to return (default 10). Fewer are
        returned when the category holds fewer other products.

    Returns
    -------
    pandas.DataFrame
        `product_name` and `brand_name` of the recommendations, most similar first.

    Raises
    ------
    IndexError
        If no product is called `name`.
    KeyError
        If a product called `name` exists, but not under `brand_name`.
    """
    name_mask = products['product_name'] == name
    if not name_mask.any():
        raise IndexError("no product named " + repr(name))

    # identify the query by name AND brand, so the category below is taken from the same
    # row that is used for the similarity lookup
    name_and_brand = name + " - " + brand_name
    query_rows = products.index[name_mask & (products['brand_name'] == brand_name)]
    if len(query_rows) == 0:
        raise KeyError(name_and_brand)
    idx = query_rows[0]

    # get category of product
    category = products.loc[idx, 'secondary_category']

    # row numbers of every product in the same category as the query
    same_cat_indices = products.index[products['secondary_category'] == category]

    # similarity of the query product to every product (honouring the matrix passed in)
    scores = cosine_sim[idx]

    # Same-category candidates, with the query removed by row number instead of assuming
    # it sorts first (a product with identical highlights ties at the top score).
    candidates = [i for i in same_cat_indices if i != idx]

    # most similar first; the sort is stable, so ties keep catalogue order
    candidates.sort(key=lambda i: scores[i], reverse=True)

    # return name and brand of the most similar same-category products (in order of similarity)
    print("Recommendations for", name+":")
    recommendations = products.iloc[candidates[:top_n]][['product_name', 'brand_name']]
    return recommendations

In [None]:
# Third iteration: the result table now also lists the brand of every recommended product
get_recommendations3(
    name="Lip Sleeping Mask Intense Hydration with Vitamin C",
    brand_name="LANEIGE",
    cosine_sim=cosine_similarity,
)

Recommendations for Lip Sleeping Mask Intense Hydration with Vitamin C:


Unnamed: 0,product_name,brand_name
1101,Lip Glowy Balm,LANEIGE
1956,The Kissu Lip Mask,Tatcha
160,Squalane+ Rose Vegan Lip Balm,Biossance
616,Sugar Recovery Lip Mask Advanced Therapy,fresh
617,Sugar Mint Rush Freshening Lip Treatment,fresh
1394,Pout Preserve Peptide Lip Treatment,OLEHENRIKSEN
1105,Lip Treatment Balm,LANEIGE
607,Sugar Lip Balm Hydrating Treatment,fresh
596,Sugar Advanced Lip Balm Intense Hydration Trea...,fresh
1667,Clean Lip Balm & Scrub,SEPHORA COLLECTION


### Fourth Iteration:

In [None]:
# Restrict both tables to the product ids they share: products without any kept review
# leave the product table, and reviews of filtered-out products leave the review table
common_product_ids = set(df_p['product_id']) & set(reviews_df['product_id'])
df_p = df_p.loc[df_p['product_id'].isin(common_product_ids)]
reviews_df = reviews_df.loc[reviews_df['product_id'].isin(common_product_ids)]

In [None]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
import re
import string

# Fetch the NLTK resources used below: tokenizer models, stop-word lists, WordNet
for nltk_resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(nltk_resource)

def preprocess_text(text):
    """Turn a piece of review text into a list of cleaned, lemmatised tokens.

    Steps, in order: tokenize, lowercase, drop punctuation tokens, drop English
    stop words, strip non-word characters from each token, drop tokens left
    empty by that stripping, lemmatise.

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    list of str
        The cleaned tokens. No token is an empty string.
    """
    # Tokenization + lowercasing
    tokens = [token.lower() for token in word_tokenize(text)]

    # Removing punctuation (this is a substring test against string.punctuation, so it
    # catches single punctuation characters and runs that appear verbatim in that string)
    tokens = [token for token in tokens if token not in string.punctuation]

    # Removing stop words
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]

    # Strip every character that is neither a word character nor whitespace (symbols,
    # emojis, leftover punctuation). \w keeps letters, digits and underscores, so
    # numbers are NOT removed here.
    tokens = [re.sub(r'[^\w\s]', '', token) for token in tokens]

    # Tokens made only of such characters (e.g. "...") are now empty strings; drop them
    # so they do not show up as doubled spaces once the tokens are joined back together.
    tokens = [token for token in tokens if token]

    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(token) for token in tokens]

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [None]:
# Keep only reviews without any missing field.
# NOTE(review): dropna() without `subset` discards a review when ANY column is empty --
# confirm that is intended rather than requiring only the columns used later.
reviews_df_filtered = reviews_df.dropna()

# get the number of reviews for every author
author_stats = reviews_df_filtered.groupby(['author_id'])['product_id'].count().reset_index(name='counts')

# attach each author's review count to every one of their reviews
merged_data = pd.merge(reviews_df_filtered, author_stats, on='author_id', how='left')

# Keep authors with at least this many reviews
MIN_REVIEWS_PER_AUTHOR = 10

# Filter and drop the helper column in one expression. drop() returns a new frame, so
# nothing is modified in place on a slice of merged_data (the in-place variant is what
# triggered pandas' "value is trying to be set on a copy of a slice" warning).
reviews_df_filtered = (
    merged_data.loc[merged_data['counts'] >= MIN_REVIEWS_PER_AUTHOR]
    .drop(columns='counts')
)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  reviews_df_filtered.drop('counts', axis=1, inplace=True)


In [None]:
#review text preprocessing

#reviewText = reviews_df_filtered.loc[:, ["review_text"]]
#reviewText_preprocessed = []
#for i in range(0,(len(reviews_df_filtered))):
#    text = (reviewText.iloc[i,:].values[0])
#    preprocessed_text = preprocess_text(text)
#    reviewText_preprocessed.append(preprocessed_text)

In [None]:
#make a column for the preprocessed review texts
#from sklearn.model_selection import train_test_split
#sentences = [' '.join(words) for words in reviewText_preprocessed]
#reviews_df_filtered.loc[:, "review_text_preprocessed"] = sentences #create a new review_text_preprocessed column
#reviews_df_filtered.head()

In [None]:
# Collect every review text of a product into one list per product_id
grouped_reviews = (
    reviews_df_filtered.groupby('product_id')['review_text']
    .apply(list)
    .reset_index()
)
grouped_reviews = grouped_reviews.rename(columns={'review_text': 'reviews_vector'})
# Attach the lists to the product table; the inner join keeps only products that have reviews
df_p = df_p.merge(grouped_reviews, on='product_id', how='inner')
df_p.head()

# Flatten each product's list of reviews into a single space-separated string
df_p['reviews_vector'] = df_p['reviews_vector'].map(' '.join)


In [None]:
# One combined review string per product, in row order
reviewText = df_p["reviews_vector"]

# Clean every product's combined review text; each entry becomes a list of tokens
reviewText_preprocessed = [preprocess_text(combined_reviews) for combined_reviews in reviewText]


In [None]:
# Re-join each product's tokens into one string and store it as a new column
sentences = list(map(' '.join, reviewText_preprocessed))
df_p["review_text_preprocessed"] = sentences
df_p.head()

Unnamed: 0,product_id,product_name,brand_id,brand_name,loves_count,rating,reviews,size,variation_type,variation_value,...,sephora_exclusive,highlights,primary_category,secondary_category,tertiary_category,child_count,child_max_price,child_min_price,reviews_vector,review_text_preprocessed
0,P439055,GENIUS Sleeping Collagen Moisturizer,6018,Algenist,33910,4.5413,1321.0,2 oz/ 60 mL,Size,2 oz/ 60 mL,...,1,"['Vegan', 'Good for: Loss of firmness', 'Colla...",Skincare,Moisturizers,Moisturizers,0,,,I am so disappointed with this cream! It does ...,disappointed cream absorb actually irritated s...
1,P421277,GENIUS Liquid Collagen Serum,6018,Algenist,67870,4.0259,1159.0,1 oz / 30 mL,Size,1 oz / 30 mL,...,1,"['Vegan', 'Good for: Loss of firmness', 'Colla...",Skincare,Treatments,Face Serums,1,25.0,25.0,I got a sample of this and thought I’d try it ...,got sample thought try even though listed per...
2,P467602,Triple Algae Eye Renewal Balm Eye Cream,6018,Algenist,17890,4.5306,1142.0,,,,...,1,,Skincare,Eye Care,Eye Creams & Treatments,0,,,I’m glad I waited to write this because this g...,glad waited write grew big time let name foo...
3,P432045,GENIUS Liquid Collagen Lip Treatment,6018,Algenist,44448,3.8721,649.0,.5 oz / 15 mL,Size,.5 oz / 15 mL,...,1,"['Vegan', 'Good for: Loss of firmness', 'Plump...",Skincare,Lip Balms & Treatments,,0,,,It does what it says - and it instantly plumps...,say instantly plump flood lip hydration howev...
4,P311143,SUBLIME DEFENSE Ultra Lightweight UV Defense F...,6018,Algenist,27278,4.4134,508.0,1 oz,Size,1 oz,...,0,"['Vegan', 'Hypoallergenic', 'UV Protection', '...",Skincare,Sunscreen,Face Sunscreen,0,,,"Silky, slightly thin consistency that never pi...",silky slightly thin consistency never pill ski...


In [None]:
# Combined text per product: highlights, one space, then the cleaned review text.
# A product with missing highlights gets a missing combined value as well.
df_p["highlights_reviews"] = df_p["highlights"].str.cat(df_p["review_text_preprocessed"], sep=" ")


In [None]:
# create new dataframe for subset of product info (now with the combined highlights + reviews text)
products_new = pd.DataFrame(df_p, columns=['product_id', 'product_name', 'ingredients', 'highlights_reviews', 'secondary_category', 'brand_name'])

# drop products lacking a category or the combined text, then renumber rows 0..n-1 so that
# row numbers line up with the rows/columns of the similarity matrix built below
products_new = (
    products_new
    .dropna(subset=['secondary_category', 'highlights_reviews'])
    .reset_index(drop=True)
)

# create a series that can be indexed using the product's name
indices_new = pd.Series(products_new.index, index=products_new['product_name'])

texts_new = products_new.highlights_reviews.values

# Use a dedicated vectorizer for the combined text. Re-fitting the shared `vectorizer`
# would overwrite the vocabulary learned from the highlights alone, leaving it out of
# sync with the earlier `tfidf_matrix`.
vectorizer_new = TfidfVectorizer()

# create a TF-IDF matrix with the text
tfidf_matrix_new = vectorizer_new.fit_transform(texts_new)

# calculate the cosine similarity matrix
cosine_similarity_new = linear_kernel(tfidf_matrix_new, tfidf_matrix_new)


In [None]:
# function that takes the name and brand of one product and recommends similar products
def get_recommendations4(name, brand_name, cosine_sim=cosine_similarity_new, top_n=10):
    """Recommend similar same-category products using highlights + review text.

    Parameters
    ----------
    name : str
        Exact product name as stored in `products_new['product_name']`.
    brand_name : str
        Exact brand as stored in `products_new['brand_name']`.
    cosine_sim : 2-D array, optional
        Pairwise similarity matrix whose rows/columns follow the row order of
        `products_new`. Defaults to `cosine_similarity_new`.
    top_n : int, optional
        Maximum number of recommendations to return (default 10). Fewer are
        returned when the category holds fewer other products.

    Returns
    -------
    pandas.DataFrame or None
        `product_name` and `brand_name` of the recommendations, most similar
        first; None (after printing a message) when no product is called `name`.

    Raises
    ------
    KeyError
        If a product called `name` exists, but not under `brand_name`.
    """
    name_mask = products_new['product_name'] == name

    # Unknown product: report it and stop here. Without the early return the code below
    # would fail on the never-assigned category.
    if not name_mask.any():
        print("No matching category found for the product name:", name)
        return None

    # identify the query by name AND brand, so the category below is taken from the same
    # row that is used for the similarity lookup
    name_and_brand = name + " - " + brand_name
    query_rows = products_new.index[name_mask & (products_new['brand_name'] == brand_name)]
    if len(query_rows) == 0:
        raise KeyError(name_and_brand)
    idx = query_rows[0]

    # get category of product
    category = products_new.loc[idx, 'secondary_category']

    # row numbers of every product in the same category as the query
    same_cat_indices = products_new.index[products_new['secondary_category'] == category]

    # similarity of the query product to every product (honouring the matrix passed in)
    scores = cosine_sim[idx]

    # Same-category candidates, with the query removed by row number instead of assuming
    # it sorts first (a product with identical text ties at the top score).
    candidates = [i for i in same_cat_indices if i != idx]

    # most similar first; the sort is stable, so ties keep catalogue order
    candidates.sort(key=lambda i: scores[i], reverse=True)

    # return name and brand of the most similar same-category products (in order of similarity)
    print("Recommendations for", name+":")
    recommendations = products_new.iloc[candidates[:top_n]][['product_name', 'brand_name']]
    return recommendations

In [None]:
# Fourth iteration: similarity is now computed from highlights combined with review text
get_recommendations4(
    name="Lip Sleeping Mask Intense Hydration with Vitamin C",
    brand_name="LANEIGE",
)

Recommendations for Lip Sleeping Mask Intense Hydration with Vitamin C:


Unnamed: 0,product_name,brand_name
486,Lip Glowy Balm,LANEIGE
489,Lip Treatment Balm,LANEIGE
814,The Kissu Lip Mask,Tatcha
295,Sugar Recovery Lip Mask Advanced Therapy,fresh
43,Squalane+ Rose Vegan Lip Balm,Biossance
203,Lippe Balm,Drunk Elephant
719,Brazilian Kiss Cupuaçu Lip Butter,Sol de Janeiro
666,Clean Lip Balm & Scrub,SEPHORA COLLECTION
234,Honey Butter Beeswax Lip Balm,Farmacy
18,Willow & Sweet Agave Plumping Lip Mask,alpyn beauty


### Fifth Iteration:

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder
import pandas as pd

# Features: the reviewer's attributes plus the product reviewed; target: the rating given.
# (This setup used to be pasted twice in this cell; it only needs to run once.)
columns_to_keep = ['skin_tone', 'eye_color', 'skin_type', 'product_name', 'rating']
reviews_df_subset = reviews_df[columns_to_keep]

X = reviews_df_subset.drop(columns=['rating'])
y = reviews_df_subset['rating']

# OneHotEncoder: every feature is categorical. handle_unknown='ignore' lets transform()
# accept categories that were not seen during fitting instead of raising.
encoder = OneHotEncoder(handle_unknown='ignore')
X_encoded = encoder.fit_transform(X)

# hold out 20% of the reviews for evaluation; fixed seed for a reproducible split
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

# linear regression model
model = LinearRegression()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

# error measure on the held-out reviews (lower is better)
from sklearn.metrics import mean_squared_error, r2_score
mse = mean_squared_error(y_test, y_pred)

print("Mean Squared Error:", mse)


Mean Squared Error: 0.9294080006092733


In [None]:
# Probe the model with one hand-written reviewer profile (light / brown / oily)
new_data = dict(
    skin_tone=['light'],
    eye_color=['brown'],
    skin_type=['oily'],
    product_name=['Bionic Glow Illuminating Liquid Highlighter with Hyaluronic Acid'],
)

# one-row frame -> same one-hot encoding as the training data
new_data_df = pd.DataFrame(new_data)
new_data_encoded = encoder.transform(new_data_df)

# predicted rating for this profile
predicted_rating = model.predict(new_data_encoded)

print("Predicted Rating:", predicted_rating)

Predicted Rating: [4.2992382]


In [None]:
# Same product as the previous probe, but for a reviewer with dry skin
new_data = dict(
    skin_tone=['light'],
    eye_color=['brown'],
    skin_type=['dry'],
    product_name=['Bionic Glow Illuminating Liquid Highlighter with Hyaluronic Acid'],
)

# one-row frame -> same one-hot encoding as the training data
new_data_df = pd.DataFrame(new_data)
new_data_encoded = encoder.transform(new_data_df)

# predicted rating for this profile
predicted_rating = model.predict(new_data_encoded)

print("Predicted Rating:", predicted_rating)

Predicted Rating: [4.31259815]


In [None]:
# A different product and a fair-skinned reviewer with dry skin
new_data = dict(
    skin_tone=['fair'],
    eye_color=['brown'],
    skin_type=['dry'],
    product_name=['GENIUS Liquid Collagen Serum'],
)

# one-row frame -> same one-hot encoding as the training data
new_data_df = pd.DataFrame(new_data)
new_data_encoded = encoder.transform(new_data_df)

# predicted rating for this profile
predicted_rating = model.predict(new_data_encoded)

print("Predicted Rating:", predicted_rating)

Predicted Rating: [4.50844373]


In [None]:
# A function for this above regression model
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder

def predict_rating(skin_tone, eye_color, skin_type, product_name, model, encoder):
    """Predict the rating one reviewer profile would give one product.

    The four attributes are placed in a one-row DataFrame with the columns
    skin_tone, eye_color, skin_type and product_name (in that order), passed
    through `encoder.transform`, and the result is handed to `model.predict`.

    Returns the first (and only) element of the model's prediction.
    """
    profile = pd.DataFrame(
        {
            'skin_tone': [skin_tone],
            'eye_color': [eye_color],
            'skin_type': [skin_type],
            'product_name': [product_name],
        }
    )
    return model.predict(encoder.transform(profile))[0]

# A notebook cell only displays its final expression, so two bare calls would show just the
# second prediction. Collect both so the deep vs. fair comparison is actually visible.
comparison_product = 'Cicapair Tiger Grass Color Correcting Treatment SPF 30'
{
    tone: predict_rating(tone, 'brown', 'dry', comparison_product, model, encoder)
    for tone in ('deep', 'fair')
}

3.94241924204499

In [None]:
def get_recommendations5(skin_tone, eye_color, skin_type, name, brand_name, model=model, encoder=encoder, cosine_sim=cosine_similarity_new, top_n=10):
    """Recommend similar same-category products with a predicted rating for one reviewer profile.

    Parameters
    ----------
    skin_tone, eye_color, skin_type : str
        Reviewer attributes passed on to `predict_rating`.
    name : str
        Exact product name as stored in `products_new['product_name']`.
    brand_name : str
        Exact brand as stored in `products_new['brand_name']`.
    model, encoder : optional
        Fitted rating model and its feature encoder, passed on to `predict_rating`.
    cosine_sim : 2-D array, optional
        Pairwise similarity matrix whose rows/columns follow the row order of
        `products_new`. Defaults to `cosine_similarity_new`.
    top_n : int, optional
        Maximum number of recommendations to return (default 10). Fewer are
        returned when the category holds fewer other products.

    Returns
    -------
    pandas.DataFrame or None
        `product_name`, `brand_name` and `predicted_rating` of the
        recommendations, most similar first; None (after printing a message)
        when no product is called `name`.

    Raises
    ------
    KeyError
        If a product called `name` exists, but not under `brand_name`.
    """
    name_mask = products_new['product_name'] == name

    # unknown product: report it and stop
    if not name_mask.any():
        print("No matching category found for the product name:", name)
        return None

    # identify the query by name AND brand, so the category below is taken from the same
    # row that is used for the similarity lookup
    name_and_brand = name + " - " + brand_name
    query_rows = products_new.index[name_mask & (products_new['brand_name'] == brand_name)]
    if len(query_rows) == 0:
        raise KeyError(name_and_brand)
    idx = query_rows[0]

    # get category of product
    category = products_new.loc[idx, 'secondary_category']

    # row numbers of every product in the same category as the query
    same_cat_indices = products_new.index[products_new['secondary_category'] == category]

    # similarity of the query product to every product (honouring the matrix passed in)
    scores = cosine_sim[idx]

    # Same-category candidates, with the query removed by row number instead of assuming
    # it sorts first (a product with identical text ties at the top score).
    candidates = [i for i in same_cat_indices if i != idx]

    # most similar first; the sort is stable, so ties keep catalogue order
    candidates.sort(key=lambda i: scores[i], reverse=True)

    # Independent copy, so the new column below is added to a frame of its own rather
    # than to a slice of products_new.
    recommendations_df = products_new.iloc[candidates[:top_n]][['product_name', 'brand_name']].copy()

    # predicted rating of every recommended product for this reviewer profile
    recommendations_df['predicted_rating'] = [
        predict_rating(skin_tone, eye_color, skin_type, recommended_name, model, encoder)
        for recommended_name in recommendations_df['product_name']
    ]

    # return the DataFrame with recommendations and predicted ratings
    print("Recommendations for", name + ":")
    return recommendations_df

In [None]:
# Fifth iteration: recommendations plus the rating predicted for a tan / brown-eyed / dry-skin reviewer
get_recommendations5(
    skin_tone='tan',
    eye_color='brown',
    skin_type='dry',
    name="Lip Sleeping Mask Intense Hydration with Vitamin C",
    brand_name="LANEIGE",
)

Recommendations for Lip Sleeping Mask Intense Hydration with Vitamin C:


Unnamed: 0,product_name,brand_name,predicted_rating
486,Lip Glowy Balm,LANEIGE,4.312995
489,Lip Treatment Balm,LANEIGE,4.257591
814,The Kissu Lip Mask,Tatcha,3.64611
295,Sugar Recovery Lip Mask Advanced Therapy,fresh,4.718539
43,Squalane+ Rose Vegan Lip Balm,Biossance,4.264761
203,Lippe Balm,Drunk Elephant,3.876951
719,Brazilian Kiss Cupuaçu Lip Butter,Sol de Janeiro,4.031013
666,Clean Lip Balm & Scrub,SEPHORA COLLECTION,4.354718
234,Honey Butter Beeswax Lip Balm,Farmacy,4.219961
18,Willow & Sweet Agave Plumping Lip Mask,alpyn beauty,4.599506


In [None]:
# Same reviewer profile, querying a retinol serum
get_recommendations5(
    skin_tone='tan',
    eye_color='brown',
    skin_type='dry',
    name="Retinol Anti-Aging Serum",
    brand_name="The INKEY List",
)

Recommendations for Retinol Anti-Aging Serum:


Unnamed: 0,product_name,brand_name,predicted_rating
515,Retinol Youth Renewal Serum,Murad,4.667177
205,A-Passioni Retinol Cream,Drunk Elephant,4.092556
584,CLINICAL 1% Retinol Treatment,Paula's Choice,4.674557
677,Retinol Reform Treatment Serum,Shani Darden Skin Care,4.288439
242,1% Vitamin A Retinol Serum,Farmacy,4.737337
271,FAB Skin Lab Retinol Serum 0.25% Pure Concentrate,First Aid Beauty,3.681529
598,Retinol Face Stick,Peace Out,4.570142
431,Micro-Dose Anti-Aging Retinol Serum with Ceram...,Kiehl's Since 1851,4.451425
763,A+ High-Dose Retinol Serum,Sunday Riley,3.970229
406,Argan Beta Retinol Pink Algae Serum,Josie Maran,4.307496


In [None]:
# Same reviewer profile, querying a face cleanser
get_recommendations5(
    skin_tone='tan',
    eye_color='brown',
    skin_type='dry',
    name="Soy Hydrating Gentle Face Cleanser",
    brand_name="fresh",
)

Recommendations for Soy Hydrating Gentle Face Cleanser:


Unnamed: 0,product_name,brand_name,predicted_rating
394,Confidence in a Cleanser Hydrating Facial Clea...,IT Cosmetics,4.22364
40,Squalane + Amino Aloe Gentle Pore-Minimizing C...,Biossance,4.146985
908,Superfood Antioxidant Cleanser,Youth To The People,4.165284
846,Fulvic Acid Brightening Cleanser,The INKEY List,4.72824
847,Mini Fulvic Acid Brightening Cleanser,The INKEY List,4.824334
315,Blueberry Bounce Gentle Cleanser,Glow Recipe,3.887933
377,Keep It Clean Hydrating Gel Cleanser with Cera...,iNNBEAUTY PROJECT,4.568328
77,Vinoclean Gentle Foam Cleanser,Caudalie,4.333074
803,The Rice Wash Skin-Softening Cleanser,Tatcha,4.344725
903,Yo Glow AHA & BHA Facial Enzyme Scrub,Wishful,4.042546
