In [182]:
import cv2
import easyocr
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re
import plotly.express as px
from joblib import dump, load

In [144]:
#read image
image_path = "./label2.jpg"

img = cv2.imread(image_path)

In [145]:
#create instance of text detector
reader = easyocr.Reader(['en'],gpu=False)

Using CPU. Note: This module is much faster with a GPU.


In [151]:
#detect text
text = reader.readtext(img)

In [188]:
user_ingredients = []
for t in text:
  user_ingredients.append(t[1])
user_ingredients = [i.lower() for i in user_ingredients]
user_ingredients

['ingredientshingredients : waterzeau, dimethicone, glycerin',
 'tapioca starch; dimethicone crosspolymer; retinol, retinyl',
 'propionate, niacinamide*; palmitoyl pentapeptide-4**',
 'polyacrylamide, c13-14 isoparaffin , dmdm hydantoin,',
 'polysorbate 20, laureth-4,acrylatesicio-30 alkyl acrylate',
 'crosspolymer, laureth-7 , dimethiconol,',
 'polymethylsilsesquioxane, aminomethyl propanol, titanium',
 'dioxide, disodium edta iodopropvnyl butylcarbamate']

In [191]:
str = " ".join(user_ingredients)
result = re.split(r"[,:;]", str)
result

['ingredientshingredients ',
 ' waterzeau',
 ' dimethicone',
 ' glycerin tapioca starch',
 ' dimethicone crosspolymer',
 ' retinol',
 ' retinyl propionate',
 ' niacinamide*',
 ' palmitoyl pentapeptide-4** polyacrylamide',
 ' c13-14 isoparaffin ',
 ' dmdm hydantoin',
 ' polysorbate 20',
 ' laureth-4',
 'acrylatesicio-30 alkyl acrylate crosspolymer',
 ' laureth-7 ',
 ' dimethiconol',
 ' polymethylsilsesquioxane',
 ' aminomethyl propanol',
 ' titanium dioxide',
 ' disodium edta iodopropvnyl butylcarbamate']

In [183]:
ingredients = pd.read_csv("ingredients.csv")

In [192]:
vectorizer = TfidfVectorizer(ngram_range=(1,2))
tfidf_matrix = vectorizer.fit_transform(ingredients["Ingredient"])
dump(vectorizer,'vectorizer.joblib')
dump(tfidf_matrix,'tfidf_matrix.joblib')

['tfidf_matrix.joblib']

In [194]:
user_vectors = vectorizer.transform(result)
similarity_scores = cosine_similarity(user_vectors,tfidf_matrix)
most_similar_indices = similarity_scores.argmax(axis=1)
matching_ingredients = ingredients.iloc[most_similar_indices]["Ingredient"].tolist()
final_ing = set(matching_ingredients)

In [195]:
final_ing

{'acrylates/c10-30 alkyl acrylate crosspolymer',
 'aminomethyl propanol',
 'c13-14 isoparaffin',
 'dimethicone',
 'dimethicone crosspolymer',
 'dimethiconol',
 'disodium edta',
 'dmdm hydantoin',
 'laureth-4',
 'niacinamide',
 'palmitoyl pentapeptide-4',
 'polymethylsilsesquioxane',
 'polysorbate 20',
 'retinol',
 'retinyl propionate',
 'secale cereale seed extract',
 'tapioca starch',
 'titanium dioxide'}

In [177]:
ingredients["Group"]

0        Cleansing and Exfoliating
1        Cleansing and Exfoliating
2        Cleansing and Exfoliating
3        Cleansing and Exfoliating
4        Cleansing and Exfoliating
                   ...            
50562     pH Balance and Stability
50563     pH Balance and Stability
50564     pH Balance and Stability
50565     pH Balance and Stability
50566     pH Balance and Stability
Name: Group, Length: 50567, dtype: object

In [172]:
function_to_group = {
    'abrasive': 'Cleansing and Exfoliating',
    'absorbent': 'Oil and Sebum Control',
    'adhesive': 'Structural Enhancement',
    'anti-seborrheic': 'Oil and Sebum Control',
    'anti-sebum': 'Oil and Sebum Control',
    'anticaking': 'Product Texture Enhancement',
    'anticorrosive': 'Preservation and Stability',
    'antifoaming': 'Product Texture Enhancement',
    'antimicrobial': 'Skin Protection and Purification',
    'antioxidant': 'Skin Protection and Vitality',
    'antiperspirant': 'Sweat and Odor Control',
    'antiplaque': 'Oral Care',
    'antistatic': 'Frizz Control and Manageability',
    'astringent': 'Skin Pore Tightening',
    'binding': 'Product Integrity and Cohesion',
    'bleaching': 'Skin Brightening and Lightening',
    'buffering': 'pH Balance and Stability',
    'bulking': 'Product Volume and Fullness',
    'chelating': 'Metal Ion Control',
    'cleansing': 'Gentle Cleansing',
    'colorant': 'Color Enhancement and Pigmentation',
    'denaturant': 'Preservation and Stability',
    'deodorant': 'Odor Control',
    'depilatory': 'Hair Removal',
    'detangling': 'Hair Detangling',
    'dispersing non-surfactant': 'Product Dispersion Enhancement',
    'emulsion stabilising': 'Product Stability and Emulsification',
    'epilating': 'Hair Removal',
    'exfoliating': 'Gentle Exfoliation',
    'eyelash conditioning': 'Eyelash Care',
    'film forming': 'Skin Barrier Protection',
    'foaming': 'Rich Foaming',
    'fragrance': 'Scent and Sensory Experience',
    'gel forming': 'Gel Formation and Texture',
    'hair conditioning': 'Hair Nourishment and Conditioning',
    'hair dyeing': 'Hair Coloration',
    'hair fixing': 'Hair Styling and Fixation',
    'hair waving or straightening': 'Hair Texture Transformation',
    'humectant': 'Moisture Retention and Hydration',
    'keratolytic': 'Skin Exfoliation',
    'light stabilizer': 'UV Protection and Light Stability',
    'lytic': 'Skin Purification',
    'moisturising': 'Intensive Hydration',
    'nail conditioning': 'Nail Care and Conditioning',
    'nail sculpting': 'Nail Sculpting and Enhancement',
    'not reported': 'Other Functions',
    'opacifying': 'Opacity Enhancement',
    'oral care': 'Oral Hygiene',
    'oxidising': 'Product Oxidation Control',
    'pearlescent': 'Luminosity Enhancement',
    'perfuming': 'Fragrance and Sensory Experience',
    'plasticiser': 'Product Flexibility and Plasticity',
    'preservative': 'Product Preservation',
    'propellant': 'Product Dispensing',
    'reducing': 'Product Stability and Reduction',
    'refatting': 'Skin Nourishment and Moisturization',
    'refreshing': 'Skin Refreshment and Vitality',
    'skin conditioning': 'Skin Nourishment and Conditioning',
    'skin conditioning - emollient': 'Skin Emollience and Softening',
    'skin conditioning - humectant': 'Skin Hydration and Moisture Retention',
    'skin conditioning - occlusive': 'Skin Barrier Protection and Moisture Lock',
    'skin conditioning - miscellaneous': 'Skin Nourishment and Miscellaneous',
    'skin protecting': 'Skin Protection and Cleansing',
    'slip modifier': 'Product Spreadability and Slip',
    'smoothing': 'Skin Smoothing and Texture Refinement',
    'solvent': 'Product Solubility and Dissolution',
    'soothing': 'Skin Soothing and Calming',
    'surface modifier': 'Product Surface Modification',
    'surfactant - cleansing': 'Gentle Cleansing',
    'surfactant - dispersing': 'Product Dispersion Enhancement',
    'surfactant - emulsifying': 'Emulsion Formation and Stability',
    'surfactant - foam boosting': 'Rich Foaming',
    'surfactant - hydrotrope': 'Product Solubilization and Dispersion',
    'surfactant - solubilizing': 'Product Solubilization and Dispersion',
    'tanning': 'Skin Tanning and Glow',
    'tonic': 'Skin Toning and Refreshment',
    'uv absorber': 'UV Protection and Absorption',
    'uv filter': 'UV Protection and Filtering',
    'viscosity controlling': 'Product Viscosity Control',
    'ph adjusters': 'pH Balance and Stability',
}

In [178]:
def calc_proportions(lst):
    total = len(lst)
    groups = [function_to_group[ingredients.loc[ingredients["Ingredient"] == i, "Function"].values[0]] for i in lst]
    freqs = {}
    for g in groups:
        if g not in freqs:
            freqs[g] = 1
        else:
            freqs[g]+=1
    res_dict = {key:round(value/total,2)for key,value in freqs.items()}
    res = {"Function":list(res_dict.keys()),"Proportion":list(res_dict.values())}
    final = pd.DataFrame(res).sort_values("Proportion",ascending=False)
    return final

In [179]:
props = calc_proportions(final_ing)
props

Unnamed: 0,Function,Proportion
7,Skin Nourishment and Conditioning,0.17
0,Product Texture Enhancement,0.11
12,Product Stability and Emulsification,0.11
1,Gentle Cleansing,0.06
2,Metal Ion Control,0.06
3,Color Enhancement and Pigmentation,0.06
4,Opacity Enhancement,0.06
5,Cleansing and Exfoliating,0.06
6,pH Balance and Stability,0.06
8,Skin Smoothing and Texture Refinement,0.06


In [180]:
fig = px.pie(props,values='Proportion',names='Function')
fig.show()