# Random Forest

In [20]:
import os
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    multilabel_confusion_matrix,
    precision_score,
    recall_score,
)
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler


## Load Files

In [21]:
# Location of the cleaned dataset. The default is the original author's local
# path; set the CLEANED_DATASET_CSV environment variable to run the notebook
# on another machine without editing this cell.
DATA_PATH = Path(
    os.environ.get(
        'CLEANED_DATASET_CSV',
        '/Users/hannah-ann/PycharmProjects/cosmetic-ingredient-classifier-app/data/cleaned/cleaned_dataset.csv',
    )
)

# Load the dataset and print its column/dtype/null-count summary.
df = pd.read_csv(DATA_PATH)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 763 entries, 0 to 762
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Ingredients  763 non-null    object
 1   Combination  763 non-null    int64 
 2   Dry          763 non-null    int64 
 3   Normal       763 non-null    int64 
 4   Oily         763 non-null    int64 
 5   Sensitive    763 non-null    int64 
dtypes: int64(5), object(1)
memory usage: 35.9+ KB


## Prepare Data - Feature Extraction (TF-IDF Vectorisation)

In [22]:
# Raw ingredient text, one string per row.
ingredient_text = df['Ingredients']

# Multi-label target: one indicator column per skin type.
y = df[['Combination', 'Dry', 'Normal', 'Oily', 'Sensitive']]

# Turn the ingredient text into a sparse TF-IDF feature matrix.
# NOTE(review): the vectorizer is fitted on every row before the train/test
# split, so the vocabulary and IDF weights also see the rows that later become
# the test set. Consider fitting it on the training rows only.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(ingredient_text)

## Split Data

In [23]:
# Hold out 20% of the rows for testing; the remaining 80% are used for
# training. The fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=36,
)


## Train Classifier Model

In [24]:
# Random Forest pipeline. The scaler runs with with_mean=False because the
# TF-IDF features are a sparse matrix, which StandardScaler cannot centre.
model = make_pipeline(
    StandardScaler(with_mean=False),
    RandomForestClassifier(random_state=42, class_weight='balanced'),
)

# Train the model on the training data
model.fit(X_train, y_train)

# k-fold cross-validation (k=5). cross_val_score refits a clone of the
# pipeline on each fold, so the model fitted above is left untouched.
# NOTE(review): this runs on the full X / y, i.e. it also includes the rows
# held out as the test set.
scores = cross_val_score(model, X, y, cv=5)

print("Cross validation :",scores)
average_accuracy = np.mean(scores)
print("Average accuracy :",average_accuracy)

Cross validation : [0.58823529 0.62745098 0.67973856 0.76315789 0.69078947]
Average accuracy : 0.669874441004472


In [25]:
# Score the fitted pipeline on the held-out test split. With multi-label
# targets this is the subset accuracy: a row counts as correct only when all
# of its labels are predicted correctly.
model.score(X_test, y_test)

0.7320261437908496

## Evaluation Metrics

In [26]:
# Predictions on the held-out test set
y_pred = model.predict(X_test)

# Use a single averaging scheme for precision and recall so the two numbers
# are comparable. Support-weighted recall is mathematically equal to micro
# recall, so the printed recall value is the same as before.
averaging = 'weighted'

# Subset accuracy: a sample counts as correct only if every label matches.
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

# Precision of the model, averaged over labels
precision = precision_score(y_test, y_pred, average=averaging)
print(f"Precision: {precision}")

# Recall of the model, averaged over labels
recall = recall_score(y_test, y_pred, average=averaging)
print(f"Recall: {recall}")

# Multi-label confusion matrix: one 2x2 matrix per label, laid out as
# [[TN, FP], [FN, TP]].
confusion_matrix = multilabel_confusion_matrix(y_test, y_pred)
print(confusion_matrix)

# Per-label report; the label names come from the target frame's columns so
# they cannot drift from the columns actually used for y.
report = classification_report(y_test, y_pred, target_names=list(y_test.columns))
print(report)


Accuracy: 0.7320261437908496
Precision: 0.9257626745175361
Recall: 0.9942938659058488
[[[  0   2]
  [  0 151]]

 [[  1  11]
  [  1 140]]

 [[  1   7]
  [  0 145]]

 [[  0  10]
  [  1 142]]

 [[  3  29]
  [  2 119]]]
              precision    recall  f1-score   support

 Combination       0.99      1.00      0.99       151
         Dry       0.93      0.99      0.96       141
      Normal       0.95      1.00      0.98       145
        Oily       0.93      0.99      0.96       143
   Sensitive       0.80      0.98      0.88       121

   micro avg       0.92      0.99      0.96       701
   macro avg       0.92      0.99      0.96       701
weighted avg       0.93      0.99      0.96       701
 samples avg       0.92      0.99      0.95       701

