In [154]:
import pandas as pd
import numpy as np
import nltk
import matplotlib.pyplot as plt
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
import re
from sklearn.metrics import accuracy_score, confusion_matrix

In [270]:
functions = pd.read_csv("function_description.csv")
functions["Function"] = functions["Function"].str.lower()
functions.head()

Unnamed: 0,Function,Description
0,abrasive,Removing unwanted tissue or foreign materials ...
1,absorbent,Taking up water- and/or oil-soluble (dissolved...
2,adhesive,Tending to unite/bind/bond surfaces together.
3,anti-seborrheic,Preventing or relieving the symptoms of seborr...
4,anti-sebum,Helping control sebum production.


In [309]:
ingredients = pd.read_csv("ingredient-function.csv")
ingredients.head()

Unnamed: 0,Ingredient,Function
0,secale cereale seed extract,abrasive
1,moroccan lava clay,abrasive
2,rosa rugosa bud powder,abrasive
3,orthoclase,abrasive
4,topaz,abrasive


In [311]:
function_to_group = {
    'abrasive': 'Cleansing and Exfoliating',
    'absorbent': 'Oil and Sebum Control',
    'adhesive': 'Structural Enhancement',
    'anti-seborrheic': 'Oil and Sebum Control',
    'anti-sebum': 'Oil and Sebum Control',
    'anticaking': 'Product Texture Enhancement',
    'anticorrosive': 'Preservation and Stability',
    'antifoaming': 'Product Texture Enhancement',
    'antimicrobial': 'Skin Protection and Purification',
    'antioxidant': 'Skin Protection and Vitality',
    'antiperspirant': 'Sweat and Odor Control',
    'antiplaque': 'Oral Care',
    'antistatic': 'Frizz Control and Manageability',
    'astringent': 'Skin Pore Tightening',
    'binding': 'Product Integrity and Cohesion',
    'bleaching': 'Skin Brightening and Lightening',
    'buffering': 'pH Balance and Stability',
    'bulking': 'Product Volume and Fullness',
    'chelating': 'Metal Ion Control',
    'cleansing': 'Gentle Cleansing',
    'colorant': 'Color Enhancement and Pigmentation',
    'denaturant': 'Preservation and Stability',
    'deodorant': 'Odor Control',
    'depilatory': 'Hair Removal',
    'detangling': 'Hair Detangling',
    'dispersing non-surfactant': 'Product Dispersion Enhancement',
    'emulsion stabilising': 'Product Stability and Emulsification',
    'epilating': 'Hair Removal',
    'exfoliating': 'Gentle Exfoliation',
    'eyelash conditioning': 'Eyelash Care',
    'film forming': 'Skin Barrier Protection',
    'foaming': 'Rich Foaming',
    'fragrance': 'Scent and Sensory Experience',
    'gel forming': 'Gel Formation and Texture',
    'hair conditioning': 'Hair Nourishment and Conditioning',
    'hair dyeing': 'Hair Coloration',
    'hair fixing': 'Hair Styling and Fixation',
    'hair waving or straightening': 'Hair Texture Transformation',
    'humectant': 'Moisture Retention and Hydration',
    'keratolytic': 'Skin Exfoliation',
    'light stabilizer': 'UV Protection and Light Stability',
    'lytic': 'Skin Purification',
    'moisturising': 'Intensive Hydration',
    'nail conditioning': 'Nail Care and Conditioning',
    'nail sculpting': 'Nail Sculpting and Enhancement',
    'not reported': 'Other Functions',
    'opacifying': 'Opacity Enhancement',
    'oral care': 'Oral Hygiene',
    'oxidising': 'Product Oxidation Control',
    'pearlescent': 'Luminosity Enhancement',
    'perfuming': 'Fragrance and Sensory Experience',
    'plasticiser': 'Product Flexibility and Plasticity',
    'preservative': 'Product Preservation',
    'propellant': 'Product Dispensing',
    'reducing': 'Product Stability and Reduction',
    'refatting': 'Skin Nourishment and Moisturization',
    'refreshing': 'Skin Refreshment and Vitality',
    'skin conditioning': 'Skin Nourishment and Conditioning',
    'skin conditioning - emollient': 'Skin Emollience and Softening',
    'skin conditioning - humectant': 'Skin Hydration and Moisture Retention',
    'skin conditioning - occlusive': 'Skin Barrier Protection and Moisture Lock',
    'skin conditioning - miscellaneous': 'Skin Nourishment and Miscellaneous',
    'skin protecting': 'Skin Protection and Cleansing',
    'slip modifier': 'Product Spreadability and Slip',
    'smoothing': 'Skin Smoothing and Texture Refinement',
    'solvent': 'Product Solubility and Dissolution',
    'soothing': 'Skin Soothing and Calming',
    'surface modifier': 'Product Surface Modification',
    'surfactant - cleansing': 'Gentle Cleansing',
    'surfactant - dispersing': 'Product Dispersion Enhancement',
    'surfactant - emulsifying': 'Emulsion Formation and Stability',
    'surfactant - foam boosting': 'Rich Foaming',
    'surfactant - hydrotrope': 'Product Solubilization and Dispersion',
    'surfactant - solubilizing': 'Product Solubilization and Dispersion',
    'tanning': 'Skin Tanning and Glow',
    'tonic': 'Skin Toning and Refreshment',
    'uv absorber': 'UV Protection and Absorption',
    'uv filter': 'UV Protection and Filtering',
    'viscosity controlling': 'Product Viscosity Control',
    'ph adjusters': 'pH Balance and Stability',
}


ingredients["Group"] = ingredients["Function"].map(function_to_group)
ingredients.head()
ingredients.to_csv("ingredients.csv",index=False)

In [294]:
group_mapping = {label: i for i, label in enumerate(ingredients['Group'].unique())}
group_mapping

{'Cleansing and Purifying': 0,
 'Structural & Textural': 1,
 'Stability': 2,
 'Appearance Enhancement': 3,
 'Oral Care': 4,
 'Sensory Experience': 5,
 'Hair Care': 6,
 'Hydration and Moisturization': 7,
 'Other': 8}

In [295]:
ingredients["Label"] = ingredients["Group"].map(group_mapping)

In [285]:
#find and remove small classes
reverse = {value:key for key,value in group_mapping.items()}
ingredients = ingredients[~ingredients["Label"].isin(smallest_groups)]
ingredients["Label"].nunique()

78

# Data Preprocessing

In [296]:
nltk.download('punkt')
nltk.download('wordnet')
cleanedData = []
lemma = WordNetLemmatizer()
for text in ingredients['Ingredient']:
  text = re.sub("[^a-zA-Z0-9]", " ", text)
  #tokenize and lemmatize
  text = word_tokenize(text)
  text = [lemma.lemmatize(word) for word in text]
  #joining
  text = " ".join(text)
  cleanedData.append(text)

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/irenenguyen/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/irenenguyen/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [297]:
vectorizer = CountVectorizer()
BOW = vectorizer.fit_transform(cleanedData)

In [298]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(BOW,np.asarray(ingredients["Label"]), stratify=ingredients["Label"])
print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)

(37923, 11844)
(37923,)
(12642, 11844)
(12642,)


In [299]:
from sklearn.svm import SVC
from sklearn.ensemble import BaggingClassifier
model = SVC(class_weight='balanced')
ensemble = BaggingClassifier(base_estimator=model,n_estimators=10)
ensemble.fit(x_train,y_train)



In [300]:
y_pred =ensemble.predict(x_test)
print("Test accuracy:",accuracy_score(y_test, y_pred))

Test accuracy: 0.43727258345198544


# Logistic Regression

In [291]:
from sklearn.linear_model import LogisticRegression

classifier = LogisticRegression()
classifier.fit(x_train, y_train)
score = classifier.score(x_test, y_test)
print("Accuracy:", score)


Accuracy: 0.2901439645625692


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [214]:
from scipy.sparse import hstack
v = CountVectorizer()
x2 = v.fit_transform(ingredients.Function)
combined = hstack((BOW,x2))
x_train,x_test,y_train,y_test = train_test_split(combined,np.asarray(ingredients["Label"]), stratify=ingredients["Label"])
print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)

(37925, 11937)
(37925,)
(12642, 11937)
(12642,)


In [215]:
classifier = LogisticRegression()
classifier.fit(x_train, y_train)
score = classifier.score(x_test, y_test)
print("Accuracy:", score)

Accuracy: 0.9998417971839899


# Keras 

In [175]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras import layers,losses

In [301]:
tokenizer = Tokenizer(num_words=30000)
tokenizer.fit_on_texts(ingredients.Ingredient)

In [302]:
train_df,test_df = train_test_split(ingredients, test_size=0.2,stratify=ingredients["Label"])
X_train = tokenizer.texts_to_sequences(train_df.Ingredient)
X_test = tokenizer.texts_to_sequences(test_df.Ingredient)
y_train = train_df.Label
y_test = test_df.Label

In [303]:
vocab_size = len(tokenizer.word_index)+1

In [304]:
maxlen=10
X_train = pad_sequences(X_train,padding='post',maxlen=maxlen)
X_test = pad_sequences(X_test,padding='post',maxlen=maxlen)

In [306]:
embedding_dim = 100
model = Sequential()
model.add(layers.Embedding(input_dim=vocab_size,output_dim=embedding_dim,input_length=maxlen, trainable=True))
model.add(layers.Conv1D(128,5,activation='relu'))
model.add(layers.GlobalMaxPool1D())
model.add(layers.Dense(64, activation='relu'))
model.add(layers.Dense(32,activation='relu'))
model.add(layers.Dense(9,activation='softmax'))
model.compile(optimizer='adam', loss=losses.SparseCategoricalCrossentropy(from_logits=True),metrics=['accuracy'])

In [307]:
history=model.fit(X_train,y_train,epochs=20,verbose=True,validation_data=(X_test,y_test),batch_size=16)
loss, accuracy = model.evaluate(X_train, y_train, verbose=False)
print("Training Accuracy: {:.4f}".format(accuracy))
loss, accuracy = model.evaluate(X_test, y_test, verbose=False)
print("Testing Accuracy:  {:.4f}".format(accuracy))

Epoch 1/20


  output, from_logits = _get_logits(


Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Training Accuracy: 0.7047
Testing Accuracy:  0.4031
