In [27]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

df = pd.read_csv('skindataall.csv')

df = df[['Product', 'Brand', 'Ing_Tfidf']].drop_duplicates()

df.reset_index(drop=True, inplace=True)

tfidf = tf=TfidfVectorizer(analyzer='word',ngram_range=(1,2),min_df=0.0,stop_words='english')
tfidf_matrix = tfidf.fit_transform(df['Ing_Tfidf'])

cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

indices = pd.Series(df.index, index=df['Product']).drop_duplicates()

def get_recommendations(product_name, cosine_sim=cosine_sim):
    if product_name not in indices:
        return "Product not found in the dataset."

    idx = indices[product_name]

    sim_scores = list(enumerate(cosine_sim[idx]))

    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    sim_scores = sim_scores[1:11]

    product_indices = [i[0] for i in sim_scores]

    return df[['Product', 'Brand']].iloc[product_indices].copy().assign(Similarity=[i[1] for i in sim_scores])

product_name = 'Superfood Antioxidant Cleanser'
recommendations = get_recommendations(product_name)

print(recommendations)


                                         Product                Brand  \
279  Superfood Firm and Brighten Vitamin C Serum  YOUTH TO THE PEOPLE   
160          Deep Dive™ Glycolic Facial Cleanser           JACK BLACK   
161            Jasmine Green Tea Balancing Toner            HERBIVORE   
54           Pink Cloud Rosewater Moisture Crème            HERBIVORE   
167                  8% Glycolic Solutions Toner    PETER THOMAS ROTH   
208                      Hydrating Essence Toner             ALGENIST   
127       Clean Bee Ultra Gentle Facial Cleanser              FARMACY   
269               Max Complexion Correction Pads    PETER THOMAS ROTH   
187                       Problem Solution Toner                BELIF   
108                      Purifying Cleansing Gel               BOSCIA   

     Similarity  
279    0.578306  
160    0.248817  
161    0.191711  
54     0.185634  
167    0.185286  
208    0.171807  
127    0.170005  
269    0.158574  
187    0.158070  
108    0.152752 

In [28]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load the dataset
df = pd.read_csv('skindataall.csv')

# Preprocess the dataset: Select the necessary columns and drop duplicates
df = df[['Product', 'Brand', 'Ing_Tfidf','Category']].drop_duplicates()
def clean_data(x):
    if isinstance(x, list):
        return [str.lower(i.replace(" ", "")) for i in x]
    else:
        #Check if director exists. If not, return empty string
        if isinstance(x, str):
            return str.lower(x.replace(" ", ""))
        else:
            return ''
df['Brand'] = df['Brand'].apply(clean_data)
df['combined_features'] = df.apply(lambda x: f" {x['Ing_Tfidf']} {x['Category']} ", axis=1)
# Reset index to ensure indices match the rows in the cosine similarity matrix
df.reset_index(drop=True, inplace=True)

# Vectorize the 'Ing_Tfidf' column
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(df['combined_features'])

# Calculate cosine similarity matrix
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Print a part of the cosine similarity matrix for verification
print("Cosine Similarity Matrix Sample:")
print(cosine_sim[0,54])  # Print a 5x5 sample of the matrix

# Create a Series to map product names to their indices
indices = pd.Series(df.index, index=df['Product']).drop_duplicates()

# Function to get top 10 similar items
def get_recommendations(product_name, cosine_sim=cosine_sim):
    if product_name not in indices:
        return "Product not found in the dataset."

    # Get the index of the product that matches the product name
    idx = indices[product_name]

    # Get the pairwise similarity scores of all products with that product
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort the products based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Print the similarity scores for verification
    print(f"Similarity Scores for '{product_name}':")
    for i, score in sim_scores[:11]:  # Include the product itself for reference
        print(f"Index: {i}, Product: {df['Product'].iloc[i]}, Similarity: {score}")

    # Get the scores of the 10 most similar products (excluding the first one, which is the same product)
    sim_scores = sim_scores[1:11]

    # Get the product indices
    product_indices = [i[0] for i in sim_scores]

    # Return the top 10 most similar products
    return df[['Product', 'Brand']].iloc[product_indices].copy().assign(Similarity=[i[1] for i in sim_scores])

# Example usage
product_name = 'Superfood Antioxidant Cleanser'
recommendations = get_recommendations(product_name)

# Print the recommendations
print("\nTop 10 Recommendations:")
print(recommendations)


Cosine Similarity Matrix Sample:
0.26537419833154907
Similarity Scores for 'Superfood Antioxidant Cleanser':
Index: 0, Product: Superfood Antioxidant Cleanser, Similarity: 1.0000000000000002
Index: 279, Product: Superfood Firm and Brighten Vitamin C Serum, Similarity: 0.7060893154585085
Index: 160, Product: Deep Dive™ Glycolic Facial Cleanser, Similarity: 0.3418590359922098
Index: 167, Product: 8% Glycolic Solutions Toner, Similarity: 0.3009440857143475
Index: 299, Product: Pure Skin® Clarifying Dietary Supplement, Similarity: 0.2960289879796117
Index: 208, Product: Hydrating Essence Toner, Similarity: 0.2677126070589102
Index: 54, Product: Pink Cloud Rosewater Moisture Crème, Similarity: 0.26537419833154907
Index: 187, Product: Problem Solution Toner, Similarity: 0.26342275371464596
Index: 127, Product: Clean Bee Ultra Gentle Facial Cleanser, Similarity: 0.26310477926041226
Index: 98, Product: Hungarian Water Essence, Similarity: 0.25439691420316696
Index: 161, Product: Jasmine Green 

# collaborative filtering

In [29]:
from surprise import Reader, Dataset, SVD
from surprise.model_selection import cross_validate
reader = Reader()

In [30]:
data = pd.read_csv('skindataall.csv')

In [31]:
data.head()

Unnamed: 0.1,Unnamed: 0,Username,Skin_Tone,Skin_Type,Eye_Color,Hair_Color,Rating_Stars,Review,Product,Brand,...,Oily,Sensitive,Category,Product_Url,User_id,Product_id,Ingredients_Cleaned,Review_Cleaned,Good_Stuff,Ing_Tfidf
0,0,allyp3,Medium,Combination,Brown,Brunette,5,This is hands down the best cleanser I’ve ever...,Superfood Antioxidant Cleanser,YOUTH TO THE PEOPLE,...,0,0,Cleanser,https://www.sephora.com/product/kale-spinach-g...,3420,157,"['sodium', 'cocoyl', 'glutamate', 'cocamidopro...","['hand', 'best', 'cleanser', 'ever', 'used', '...",1,"sodium, cocoyl, glutamate, cocamidopropyl, bet..."
1,1,PatTea,Medium,Combination,Brown,Red,1,Unfortunately this doesn’t work for everyone. ...,Superfood Antioxidant Cleanser,YOUTH TO THE PEOPLE,...,0,0,Cleanser,https://www.sephora.com/product/kale-spinach-g...,2483,157,"['sodium', 'cocoyl', 'glutamate', 'cocamidopro...","['unfortunately', 'work', 'everyone', 'used', ...",0,"sodium, cocoyl, glutamate, cocamidopropyl, bet..."
2,2,Sabi1991,No data,No data,No data,No data,5,My favorite cleanser!! i love the packaging on...,Superfood Antioxidant Cleanser,YOUTH TO THE PEOPLE,...,0,0,Cleanser,https://www.sephora.com/product/kale-spinach-g...,2715,157,"['sodium', 'cocoyl', 'glutamate', 'cocamidopro...","['favorite', 'cleanser', 'love', 'packaging', ...",1,"sodium, cocoyl, glutamate, cocamidopropyl, bet..."
3,3,happyface2,Fair,Dry,Blue,Blonde,5,I love all things Youth To The People! This cl...,Superfood Antioxidant Cleanser,YOUTH TO THE PEOPLE,...,0,0,Cleanser,https://www.sephora.com/product/kale-spinach-g...,4497,157,"['sodium', 'cocoyl', 'glutamate', 'cocamidopro...","['love', 'thing', 'youth', 'people', 'cleanser...",1,"sodium, cocoyl, glutamate, cocamidopropyl, bet..."
4,4,kimkix34,Fair,Normal,Green,Blonde,5,I had a trial size of this and was obsessed. M...,Superfood Antioxidant Cleanser,YOUTH TO THE PEOPLE,...,0,0,Cleanser,https://www.sephora.com/product/kale-spinach-g...,5017,157,"['sodium', 'cocoyl', 'glutamate', 'cocamidopro...","['trial', 'size', 'wa', 'obsessed', 'skin', 'f...",1,"sodium, cocoyl, glutamate, cocamidopropyl, bet..."


In [32]:
columns_to_include = ['User_id','Product_id','Rating']
data = data[columns_to_include]

In [33]:
data.head()

Unnamed: 0,User_id,Product_id,Rating
0,3420,157,4.4
1,2483,157,4.4
2,2715,157,4.4
3,4497,157,4.4
4,5017,157,4.4


In [34]:
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(data[['User_id', 'Product_id', 'Rating']], reader)

In [35]:
svd = SVD()
results = cross_validate(svd, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.1233  0.1135  0.1417  0.1139  0.1145  0.1214  0.0108  
MAE (testset)     0.0700  0.0659  0.0736  0.0671  0.0678  0.0689  0.0027  
Fit time          0.14    0.13    0.13    0.12    0.12    0.13    0.01    
Test time         0.02    0.01    0.02    0.01    0.02    0.01    0.00    


In [36]:
trainset = data.build_full_trainset()
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x159d2ff6500>

In [37]:
svd.predict(3420, 157)

Prediction(uid=3420, iid=157, r_ui=None, est=4.487263555327925, details={'was_impossible': False})

In [48]:
df = pd.read_csv('skindataall.csv')
def get_top_ten(user_id):
    all_product_ids = df['Product_id'].unique()
    
    rated_product_ids = df[df['User_id'] == user_id]['Product_id'].unique()
    
    unrated_product_ids = [pid for pid in all_product_ids if pid not in rated_product_ids]
    
    predictions = []
    for pid in unrated_product_ids:
        pred = svd.predict(user_id, pid)
        predictions.append((pid, pred.est))
    
    predictions.sort(key=lambda x: x[1], reverse=True)
    
    top_ten_product_ids = [pred[0] for pred in predictions[:10]]
    
    return top_ten_product_ids


user_id = 4709 
top_10_products = get_top_ten(user_id)

# Print the recommended products
#recommended_product_details = df[df['Product_id'].isin(top_10_products)][['Product_id', 'Product']].drop_duplicates()
filtered_products = df[df['Product_id'].isin(top_10_products)]

# Extract Product_id and Product columns as a list of tuples
products_list = filtered_products[['Product_id', 'Product']].values.tolist()
print(products_list)


[[117, 'Facial Cotton'], [117, 'Facial Cotton'], [117, 'Facial Cotton'], [117, 'Facial Cotton'], [117, 'Facial Cotton'], [117, 'Facial Cotton'], [117, 'Facial Cotton'], [117, 'Facial Cotton'], [117, 'Facial Cotton'], [117, 'Facial Cotton'], [117, 'Facial Cotton'], [117, 'Facial Cotton'], [117, 'Facial Cotton'], [117, 'Facial Cotton'], [117, 'Facial Cotton'], [117, 'Facial Cotton'], [117, 'Facial Cotton'], [117, 'Facial Cotton'], [117, 'Facial Cotton'], [117, 'Facial Cotton'], [117, 'Facial Cotton'], [117, 'Facial Cotton'], [117, 'Facial Cotton'], [117, 'Facial Cotton'], [117, 'Facial Cotton'], [117, 'Facial Cotton'], [117, 'Facial Cotton'], [117, 'Facial Cotton'], [117, 'Facial Cotton'], [117, 'Facial Cotton'], [117, 'Facial Cotton'], [117, 'Facial Cotton'], [117, 'Facial Cotton'], [117, 'Facial Cotton'], [117, 'Facial Cotton'], [117, 'Facial Cotton'], [117, 'Facial Cotton'], [117, 'Facial Cotton'], [117, 'Facial Cotton'], [117, 'Facial Cotton'], [117, 'Facial Cotton'], [117, 'Facial C

In [49]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Example dataset (replace with your actual dataset)
data = pd.read_csv('skindataall.csv')
data['combined_features'] = data.apply(lambda x: f"{x['Rating']}{x['Review']}", axis=1)
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(data['combined_features'])

cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

def recommend_products(user_id, top_n=10):
    # Filter data for the target user
    user_data = data[data['User_id'] == user_id]    
    product_tfidf_scores = {}
    for index, row in user_data.iterrows():
        product_id = row['Product_id']
        tfidf_index = tfidf.vocabulary_.get(row['Review'].split()[0].lower(), -1)
        if tfidf_index != -1:
            product_tfidf_scores[product_id] = tfidf_matrix[index, tfidf_index]
    
    # Find similar users based on cosine similarity
    similar_users = []
    for index, row in data.iterrows():
        if row['User_id'] != user_id:
            similarity = cosine_sim[user_data.index, index].mean()
            similar_users.append((row['User_id'], similarity))
    
    # Sort similar users by similarity score in descending order
    similar_users = sorted(similar_users, key=lambda x: x[1], reverse=True)
    
    # Recommend top products based on similar users' ratings
    recommended_products = []
    for user, similarity in similar_users:
        user_data = data[data['User_id'] == user]
        for index, row in user_data.iterrows():
            if row['Product_id'] not in product_tfidf_scores:
                recommended_products.append((row['Product_id'], row['Product']))
            if len(recommended_products) >= top_n:
                break
        if len(recommended_products) >= top_n:
            break
    
    return recommended_products[:top_n]

# Example usage: Recommend products to User 4709
user_id = 4709
top_10_products = recommend_products(user_id, top_n=10)
print(f"Top 10 recommended products for User {user_id}:")
for product_id, product_name in top_10_products:
    print(f"Product ID: {product_id}, Product Name: {product_name}")


Top 10 recommended products for User 4709:
Product ID: 104, Product Name: Eudermine Revitalizing Essence
Product ID: 167, Product Name: Max Complexion Correction Pads
Product ID: 96, Product Name: Dramatically Different Moisturizing Lotion+
Product ID: 44, Product Name: Blotting Papers
Product ID: 182, Product Name: Invigorating Night Transformation™ Gel
Product ID: 182, Product Name: Invigorating Night Transformation™ Gel
Product ID: 126, Product Name: Find Your Balance™ Oil Control Cleanser
Product ID: 197, Product Name: Pore-Balance™ Facial Sauna Scrub
Product ID: 295, Product Name: Umbrian Clay Pore Purifying Face Mask
Product ID: 182, Product Name: Invigorating Night Transformation™ Gel


In [134]:
df.head()

Unnamed: 0,User_id,Product_id,Rating,Review,Sentiment,Combined
0,3420,157,4.4,This is hands down the best cleanser I’ve ever...,0.127662,3.118299
1,2483,157,4.4,Unfortunately this doesn’t work for everyone. ...,0.111111,3.113333
2,2715,157,4.4,My favorite cleanser!! i love the packaging on...,0.582812,3.254844
3,4497,157,4.4,I love all things Youth To The People! This cl...,0.259314,3.157794
4,5017,157,4.4,I had a trial size of this and was obsessed. M...,0.277778,3.163333


In [47]:
def get_top_ten(user_id):
    dff = pd.read_csv('skindataall.csv')
    all_product_ids = dff['Product_id'].unique()
    
    rated_product_ids = dff[dff['User_id'] == user_id]['Product_id'].unique()
    
    unrated_product_ids = [pid for pid in all_product_ids if pid not in rated_product_ids]
    
    predictions = []
    for pid in unrated_product_ids:
        pred = svd.predict(user_id, pid)
        predictions.append((pid, pred.est))
    
    predictions.sort(key=lambda x: x[1], reverse=True)
    
    top_ten_product_ids = [pred[0] for pred in predictions[:10]]
    
    return top_ten_product_ids


user_id = 4709 
top_10_products = get_top_ten(user_id)

# Print the recommended products
#recommended_product_details = df[df['Product_id'].isin(top_10_products)][['Product_id', 'Product']].drop_duplicates()
print(top_10_products)


[97, 130, 117, 178, 109, 252, 292, 302, 70, 278]


In [52]:
# Load and preprocess the dataset
data = pd.read_csv('skindataall.csv')
df = data
reader = Reader()
columns_to_include = ['User_id','Product_id','Rating']
data = data[columns_to_include]
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(data[['User_id', 'Product_id', 'Rating']], reader)
svd = SVD()
results = cross_validate(svd, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
trainset = data.build_full_trainset()
svd.fit(trainset)
def get_top_n(user_id,n):
    all_product_ids = df['Product_id'].unique()
    
    rated_product_ids = df[df['User_id'] == user_id]['Product_id'].unique()
    print(rated_product_ids)
    unrated_product_ids = [pid for pid in all_product_ids if pid not in rated_product_ids]
    
    predictions = []
    for pid in unrated_product_ids:
        pred = svd.predict(user_id, pid)
        predictions.append((pid, pred.est))
    
    predictions.sort(key=lambda x: x[1], reverse=True)
    
    top_ten_product_ids = [pred[0] for pred in predictions[:n]]
    print(top_ten_product_ids)
    #filtered_products = df[df['Product_id'].isin(top_ten_product_ids)]
    #Extract Product_id and Product columns as a list of tuples
    #products_list = filtered_products[['Product_id', 'Product']].values.tolist()
    top_ten_product_ids = [int(idx) for idx in top_ten_product_ids]
    return top_ten_product_ids

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.1073  0.1118  0.1308  0.1359  0.1350  0.1242  0.0121  
MAE (testset)     0.0651  0.0691  0.0720  0.0730  0.0690  0.0696  0.0028  
Fit time          0.11    0.11    0.12    0.11    0.13    0.12    0.01    
Test time         0.02    0.01    0.01    0.02    0.01    0.01    0.00    


In [56]:
get_top_n(157,20)

[170]
[252, 108, 178, 97, 247, 116, 232, 130, 278, 294, 150, 117, 109, 266, 127, 87, 53, 272, 302, 180]


[252,
 108,
 178,
 97,
 247,
 116,
 232,
 130,
 278,
 294,
 150,
 117,
 109,
 266,
 127,
 87,
 53,
 272,
 302,
 180]