In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, multilabel_confusion_matrix

# Load Files

In [2]:
df = pd.read_csv('/Users/hannah-ann/PycharmProjects/cosmetic-ingredient-classifier-app/data/cleaned/cleaned_dataset.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 763 entries, 0 to 762
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Ingredients  763 non-null    object
 1   Combination  763 non-null    int64 
 2   Dry          763 non-null    int64 
 3   Normal       763 non-null    int64 
 4   Oily         763 non-null    int64 
 5   Sensitive    763 non-null    int64 
dtypes: int64(5), object(1)
memory usage: 35.9+ KB


# Prepare Data - Vectorisation

In [3]:
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['Ingredients'])
y = df[['Combination', 'Dry', 'Normal', 'Oily', 'Sensitive']]

# Split

In [4]:
# Split the data (80% training, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=36)


# Train

In [5]:
model = KNeighborsClassifier(n_neighbors=9)
model.fit(X_train, y_train)

# Predictions


In [6]:
y_pred = model.predict(X_test)

# Evaluation

In [7]:
from sklearn.metrics import precision_score, recall_score ,multilabel_confusion_matrix
# Calculate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred,normalize=True)
print(f"Accuracy: {accuracy}")

# Calculate the precision of the model
precision = precision_score(y_test, y_pred,average='weighted')
print(f"Precision: {precision}")

# Calculate to recall fo the model07986r-00-
recall = recall_score(y_test, y_pred,average='micro')

print(f"Recall: {recall}")


# multi label confusion matrix
confusion_matrix = multilabel_confusion_matrix(y_test, y_pred)
print(confusion_matrix)

report = classification_report(y_test, y_pred,target_names=['Combination', 'Dry', 'Normal', 'Oily', 'Sensitive'])
print(report)

Accuracy: 0.7254901960784313
Precision: 0.9280902638314649
Recall: 0.9942938659058488
[[[  0   2]
  [  0 151]]

 [[  1  11]
  [  0 141]]

 [[  0   8]
  [  0 145]]

 [[  0  10]
  [  0 143]]

 [[  7  25]
  [  4 117]]]
              precision    recall  f1-score   support

 Combination       0.99      1.00      0.99       151
         Dry       0.93      1.00      0.96       141
      Normal       0.95      1.00      0.97       145
        Oily       0.93      1.00      0.97       143
   Sensitive       0.82      0.97      0.89       121

   micro avg       0.93      0.99      0.96       701
   macro avg       0.92      0.99      0.96       701
weighted avg       0.93      0.99      0.96       701
 samples avg       0.92      0.99      0.95       701

