# 1. Importing basic libraries and the dataset

In [24]:
# Basic libraries
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
# Read the cosmetics product table from the CSV file
# (the path is relative to the current working directory)
df = pd.read_csv(filepath_or_buffer='most_used_beauty_cosmetics_products_extended.csv')
# Print the first five rows as a quick sanity check of the load
print(df.head(n=5))

       Product_Name              Brand        Category Usage_Frequency  \
0   Ultra Face Mask     Drunk Elephant           Blush          Weekly   
1    Ultra Lipstick      Laura Mercier  Makeup Remover      Occasional   
2       Ultra Serum     Natasha Denona     Highlighter           Daily   
3      Divine Serum        Ilia Beauty       Face Mask      Occasional   
4  Super Foundation  Charlotte Tilbury     Highlighter      Occasional   

   Price_USD  Rating  Number_of_Reviews Product_Size  Skin_Type Gender_Target  \
0      67.85     1.4                686         30ml  Sensitive        Female   
1     116.43     4.2               5483        250ml        Dry        Unisex   
2      90.84     1.6               5039        100ml  Sensitive          Male   
3      55.17     3.2               6202        250ml     Normal          Male   
4     140.56     1.7                297        100ml       Oily        Female   

  Packaging_Type Main_Ingredient  Cruelty_Free Country_of_Origin  
0

In [4]:
# Check the datatypes of each column. Columns reported as `object` hold text
# and cannot be passed to the model until they are converted to numbers.
df.dtypes

Product_Name          object
Brand                 object
Category              object
Usage_Frequency       object
Price_USD            float64
Rating               float64
Number_of_Reviews      int64
Product_Size          object
Skin_Type             object
Gender_Target         object
Packaging_Type        object
Main_Ingredient       object
Cruelty_Free            bool
Country_of_Origin     object
dtype: object

# 2. Data cleaning

In [5]:
# Discard the columns that are not used as model inputs
df = df.drop(['Product_Name', 'Packaging_Type', 'Main_Ingredient', 'Country_of_Origin'], axis=1)
# Product_Size is stored as text such as '30ml': remove the 'ml' unit and
# keep the remaining number as a float
df['Product_Size'] = (
    df['Product_Size']
    .str.replace('ml', '')
    .astype(float)
)

In [6]:
# Check dataset structure after cleaning: the four dropped columns should be
# gone and Product_Size should now be numeric (e.g. 30.0 instead of '30ml')
print(df.head(5))

               Brand        Category Usage_Frequency  Price_USD  Rating  \
0     Drunk Elephant           Blush          Weekly      67.85     1.4   
1      Laura Mercier  Makeup Remover      Occasional     116.43     4.2   
2     Natasha Denona     Highlighter           Daily      90.84     1.6   
3        Ilia Beauty       Face Mask      Occasional      55.17     3.2   
4  Charlotte Tilbury     Highlighter      Occasional     140.56     1.7   

   Number_of_Reviews  Product_Size  Skin_Type Gender_Target  Cruelty_Free  
0                686          30.0  Sensitive        Female         False  
1               5483         250.0        Dry        Unisex         False  
2               5039         100.0  Sensitive          Male          True  
3               6202         250.0     Normal          Male          True  
4                297         100.0       Oily        Female         False  


# 3. Converting categorical values into numerical values with label encoding and one-hot encoding

In [7]:
# Target: turn each brand name into an integer code. The fitted encoder is
# kept so that predicted codes can be mapped back to brand names later on.
label_encoder_brand = LabelEncoder()
df['Brand'] = label_encoder_brand.fit_transform(df['Brand'])

# Inputs: replace every categorical feature with one indicator column per
# category value (one-hot encoding)
categorical_cols = [
    'Category',
    'Usage_Frequency',
    'Skin_Type',
    'Gender_Target',
    'Cruelty_Free',
]
df = pd.get_dummies(data=df, columns=categorical_cols)

In [8]:
# Check dataset structure to confirm that one-hot encoding has been applied:
# Brand should now be an integer code and each categorical feature should be
# replaced by a set of 0/1 indicator columns (e.g. Category_Blush)
print(df.head(5))

   Brand  Price_USD  Rating  Number_of_Reviews  Product_Size  \
0      9      67.85     1.4                686          30.0   
1     22     116.43     4.2               5483         250.0   
2     27      90.84     1.6               5039         100.0   
3     17      55.17     3.2               6202         250.0   
4      5     140.56     1.7                297         100.0   

   Category_BB Cream  Category_Blush  Category_Bronzer  Category_CC Cream  \
0                  0               1                 0                  0   
1                  0               0                 0                  0   
2                  0               0                 0                  0   
3                  0               0                 0                  0   
4                  0               0                 0                  0   

   Category_Cleanser  ...  Skin_Type_Combination  Skin_Type_Dry  \
0                  0  ...                      0              0   
1                 

In [9]:
# Check that all data types have been converted to numerical ones
# (no `object` columns should remain)
df.dtypes

Brand                           int32
Price_USD                     float64
Rating                        float64
Number_of_Reviews               int64
Product_Size                  float64
Category_BB Cream               uint8
Category_Blush                  uint8
Category_Bronzer                uint8
Category_CC Cream               uint8
Category_Cleanser               uint8
Category_Concealer              uint8
Category_Contour                uint8
Category_Exfoliator             uint8
Category_Eye Shadow             uint8
Category_Eyeliner               uint8
Category_Face Mask              uint8
Category_Face Oil               uint8
Category_Foundation             uint8
Category_Highlighter            uint8
Category_Lip Gloss              uint8
Category_Lip Liner              uint8
Category_Lipstick               uint8
Category_Makeup Remover         uint8
Category_Mascara                uint8
Category_Moisturizer            uint8
Category_Powder                 uint8
Category_Pri

# 4. Training the model

In [10]:
# Target (y): the encoded brand. Features (X): every other column.
y = df['Brand']
X = df.drop('Brand', axis=1)

In [11]:
# Split the data into training and testing sets: 30% of the rows are held out
# for testing, and the fixed random_state makes the split repeatable.
# NOTE(review): the split is not stratified by brand - consider stratify=y if
# per-brand test counts should mirror the full data set.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [12]:
# Step 1: Define the parameter grid
# The grid search evaluates every combination of the values listed below.
param_grid = dict(
    n_estimators=[50, 100, 200],    # how many trees the forest contains
    max_depth=[None, 10, 20, 30],   # depth limit of each tree (None leaves it unrestricted)
    min_samples_split=[2, 5, 10],   # fewest samples an internal node needs before it may be split
    min_samples_leaf=[1, 2, 4],     # fewest samples that must end up in a leaf node
    bootstrap=[True, False],        # build each tree on a bootstrap sample, or not
)

In [13]:
# Step 2: Initialize the RandomForestClassifier
# Only the random seed is fixed here (for repeatable results); the
# hyper-parameters listed in param_grid are set by the grid search.
rf_model = RandomForestClassifier(random_state=30)

In [14]:
# Step 3: Set up GridSearchCV with cross-validation
grid_search = GridSearchCV(
    rf_model,            # base estimator that is cloned for every candidate
    param_grid,          # hyper-parameter combinations to try
    scoring='accuracy',  # candidates are ranked by accuracy
    cv=5,                # 5-fold cross-validation
    n_jobs=-1,           # run on all available CPU cores
    verbose=2,           # report progress while searching
)

In [15]:
# Step 4: Fit GridSearchCV to the training data
# The grid holds 3 * 4 * 3 * 3 * 2 = 216 candidates; with 5 folds each this
# trains 1080 forests, so this cell is the slow part of the notebook.
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 216 candidates, totalling 1080 fits


GridSearchCV(cv=5, estimator=RandomForestClassifier(random_state=30), n_jobs=-1,
             param_grid={'bootstrap': [True, False],
                         'max_depth': [None, 10, 20, 30],
                         'min_samples_leaf': [1, 2, 4],
                         'min_samples_split': [2, 5, 10],
                         'n_estimators': [50, 100, 200]},
             scoring='accuracy', verbose=2)

In [16]:
# Step 5: Output the best parameters found during the grid search
# (the combination with the highest mean cross-validated accuracy)
print("Best Parameters:", grid_search.best_params_)

Best Parameters: {'bootstrap': False, 'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 200}


In [17]:
# Step 6: Make predictions on the test set using the best estimator
# best_estimator_ is the forest with the best parameters; GridSearchCV refits
# it on the whole training set by default (refit=True).
best_rf_model = grid_search.best_estimator_
# y_pred is kept for the evaluation in section 6
y_pred = best_rf_model.predict(X_test)

# 5. Entering user inputs to obtain recommended brand to buy

In [19]:
# Example product description, given in raw (pre-encoding) form
sample_input = dict(
    Category='Blush',
    Usage_Frequency='Daily',
    Price_USD=25.99,
    Rating=4.5,
    Number_of_Reviews=1000,
    Product_Size=100,  # millilitres, given directly as a number
    Skin_Type='Oily',
    Gender_Target='Male',
    Cruelty_Free='True',  # passed as text, like the other categorical fields
)

print(sample_input)

{'Category': 'Blush', 'Usage_Frequency': 'Daily', 'Price_USD': 25.99, 'Rating': 4.5, 'Number_of_Reviews': 1000, 'Product_Size': 100, 'Skin_Type': 'Oily', 'Gender_Target': 'Male', 'Cruelty_Free': 'True'}


In [20]:
# Convert sample_input into a single-row DataFrame
input_df = pd.DataFrame([sample_input])

# Apply the same one-hot encoding as the training data. The categorical
# columns are named explicitly (rather than detected by dtype) so that a
# non-string value, e.g. a boolean Cruelty_Free, is encoded too instead of
# being left as a raw column. Fields missing from the input are skipped here
# and zero-filled by the reindex below, as before.
input_df = pd.get_dummies(
    input_df,
    columns=[col for col in categorical_cols if col in input_df.columns],
)

# Guard against silent errors: reindex() below discards every column the model
# was not trained on. A misspelled field name or an unseen category value
# would therefore vanish and leave all of its indicator columns at 0, giving a
# prediction that ignores that part of the input.
unknown_columns = [col for col in input_df.columns if col not in X.columns]
if unknown_columns:
    raise ValueError(
        "Input contains fields or category values that were not seen during "
        f"training: {unknown_columns}"
    )

# Align input_df with the training data: same columns in the same order, with
# the indicator columns of all non-selected categories set to 0
input_df = input_df.reindex(columns=X.columns, fill_value=0)

# Select the same feature columns as the ones used for training
X_new = input_df

# Make a prediction with the best model (returns the encoded brand)
brand_prediction = best_rf_model.predict(X_new)

# Convert the predicted brand code back to the brand name
predicted_brand_categorical = label_encoder_brand.inverse_transform([brand_prediction[0]])
print("Predicted Brand (Categorical):", predicted_brand_categorical[0])

Predicted Brand (Categorical): Danessa Myricks


### Therefore, with the given sample inputs, the predicted brand to purchase from is Danessa Myricks.


# 6. Analyzing the data

In [22]:
# Per-brand precision, recall and F1-score on the held-out test set. The row
# labels are the integer brand codes produced by label_encoder_brand.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.04      0.02      0.02       122
           1       0.02      0.04      0.03       104
           2       0.03      0.03      0.03       127
           3       0.04      0.04      0.04       110
           4       0.02      0.01      0.01       104
           5       0.05      0.02      0.03       122
           6       0.04      0.02      0.03       121
           7       0.02      0.01      0.01        96
           8       0.04      0.04      0.04        95
           9       0.00      0.00      0.00       118
          10       0.02      0.02      0.02       132
          11       0.02      0.02      0.02       123
          12       0.03      0.02      0.02       115
          13       0.01      0.01      0.01       116
          14       0.01      0.01      0.01       110
          15       0.05      0.04      0.05       120
          16       0.03      0.01      0.01       109
          17       0.02    

### The scores in the classification report are extremely low, signifying a serious problem with the random forest model. To investigate the issue, we use cosine similarity to compare the brands' average feature vectors.

In [25]:
# Compare the brands by the direction of their average feature vector.
# X is the feature matrix and y holds the encoded brand of each row.
y = df["Brand"]
unique_labels = np.unique(y)

# One vector per brand: the column-wise mean over that brand's rows
class_features = [
    X.loc[y == label].mean(axis=0)
    for label in unique_labels
]

# Pairwise cosine similarity between the brand vectors; entry [i, j] compares
# brand i with brand j.
# NOTE(review): the features are used on their raw scale, so columns with
# large values (e.g. Number_of_Reviews, in the thousands) dominate every
# vector and push all similarities towards 1 - interpret with care.
similarity_matrix = cosine_similarity(class_features)

In [26]:
# Display the brand-by-brand cosine similarity matrix (the diagonal compares
# each brand with itself)
similarity_matrix

array([[1.        , 0.99999989, 0.99999628, ..., 0.99999955, 0.99999971,
        0.99999833],
       [0.99999989, 1.        , 0.99999745, ..., 0.99999903, 0.99999925,
        0.99999744],
       [0.99999628, 0.99999745, 1.        , ..., 0.99999373, 0.99999407,
        0.99999067],
       ...,
       [0.99999955, 0.99999903, 0.99999373, ..., 1.        , 0.99999996,
        0.9999996 ],
       [0.99999971, 0.99999925, 0.99999407, ..., 0.99999996, 1.        ,
        0.99999936],
       [0.99999833, 0.99999744, 0.99999067, ..., 0.9999996 , 0.99999936,
        1.        ]])

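### The cosine similarity between the brands' average feature vectors is above 0.9999 for every pair shown, i.e. the average product profile is practically the same for every brand. The scores are low because the classes are too similar in these features for the model to distinguish between them. As such, a random forest model trained on these features is not suitable for predicting the brand in this data set.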