## Import libraries

In [118]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt

## Clean data

In [172]:
df = pd.read_csv('body-care-1.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 666 entries, 0 to 665
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Label        666 non-null    object
 1   URL          666 non-null    object
 2   brand        666 non-null    object
 3   name         666 non-null    object
 4   price        666 non-null    object
 5   skin_type    666 non-null    object
 6   ingredients  661 non-null    object
dtypes: object(7)
memory usage: 36.5+ KB


In [173]:
df.head(15)

Unnamed: 0,Label,URL,brand,name,price,skin_type,ingredients
0,bath-and-body-soap,https://www.sephora.com/product/kp-bump-eraser...,FIRST AID BEAUTY,KP Bump Eraser Body Scrub with 10% AHA,$28.00,"['Normal,', 'Dry,', 'Combination\n']",-Pumice Buffing Beads: Exfoliate particles to ...
1,bath-and-body-soap,https://www.sephora.com/product/sol-de-janeiro...,SOL DE JANEIRO,Bum Bum Body Scrub,$42.00,"['Normal,', 'Dry,', 'Combination,', 'Oily\n']",-Crushed Cupuaçu Seeds: Gently buff and smooth...
2,bath-and-body-soap,https://www.sephora.com/product/brazilian-4-pl...,SOL DE JANEIRO,Brazilian 4 Play Moisturizing Shower Cream-Gel,$25.00,"['Dry,', 'Normal,', 'Combination,', 'Oily,']",
3,bath-and-body-soap,https://www.sephora.com/product/sol-de-janeiro...,SOL DE JANEIRO,Brazilian Touch Hand Sanitizer Spray,$10.00,[],"Alcohol 80%, Aqua (water), glycerine, hydrogen..."
4,bath-and-body-soap,https://www.sephora.com/product/necessaire-the...,NÉCESSAIRE,The Body Wash,$25.00,"['Normal,', 'Dry,', 'Combination,', 'Oily\n']","-Niacinamide (Vitamin B3): Cleanses, nourishes..."
5,bath-and-body-soap,https://www.sephora.com/product/resurface-glyc...,SKINFIX,Resurface+ Glycolic Renewing Body Scrub,$30.00,[],-Glycolic Acid: Helps slough off dead skin cel...
6,bath-and-body-soap,https://www.sephora.com/product/scalp-body-scr...,OUAI,Scalp & Body Scrub,$38.00,[],-Sugar: Unclogs and gently exfoliates and cond...
7,bath-and-body-soap,https://www.sephora.com/product/coco-rose-coco...,HERBIVORE,Coco Rose Exfoliating Body Scrub,$36.00,"['Normal\n', 'Oily\n', 'Combination\n', 'Dry\n']",-Virgin Coconut Oil: Provides intensive hydrat...
8,bath-and-body-soap,https://www.sephora.com/product/brown-sugar-bo...,FRESH,Brown Sugar Body Polish Exfoliator,$67.00,"['Normal,', 'Dry,', 'Combination,', 'Oily\n']",-Real Brown Sugar Crystals: Act as natural hum...
9,bath-and-body-soap,https://www.sephora.com/product/cleansing-soft...,L'OCCITANE,Cleansing And Softening Shower Oil With Almond...,$25.00,[],-Almond Proteins\n-Almond Oil: Naturally rich ...


In [174]:
# label
# ['bath-and-body-soap', 'body-moisturizers', 'sun-lotion', 'body-care', 'beauty-supplements-bath-body']

df.Label[df['Label'] == 'bath-and-body-soap'] = str('soap')
df.Label[df['Label'] == 'body-moisturizers'] = str('moisturizers')
df.Label[df['Label'] == 'sun-lotion'] = str('SPF')
df.Label[df['Label'] == 'body-care'] = str('body')
df.Label[df['Label'] == 'beauty-supplements-bath-body'] = str('supplements')

In [175]:
df['Label'].value_counts()

moisturizers    204
SPF             180
soap            119
supplements      98
body             65
Name: Label, dtype: int64

In [176]:
# remove duplicated items
df2 = df['name'].drop_duplicates()
df = df.loc[df2.index, :].reset_index()

# .loc accesses a group of rows and columns by a label or a boolean array
# reset_index() resets the index from 0 to length of data 

In [177]:
# URL
df.drop(['URL','index'], axis = 1, inplace = True)

In [178]:
df = df.loc[pd.notnull(df['ingredients'])]
# removed null ingredient items 

df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 638 entries, 0 to 642
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Label        638 non-null    object
 1   brand        638 non-null    object
 2   name         638 non-null    object
 3   price        638 non-null    object
 4   skin_type    638 non-null    object
 5   ingredients  638 non-null    object
dtypes: object(6)
memory usage: 34.9+ KB


## Test block

In [185]:
text = df['ingredients'][0]
print(text)

pattern = re.compile(r"([a-zA-Z]+)\\n")
new = re.findall(pattern,text)

print(new)

-Pumice Buffing Beads: Exfoliate particles to remove dead cells.
-Glycolic and Lactic Acids: Exfoliate to help skin appear bright.
-Bisabolol: Soothes skin.

Pumice, Water, Glycolic Acid, Sodium Cocoyl Isethionate, Lactic Acid, Dimethicone, Stearic Acid, Cetearyl Alcohol, Sodium Hydroxide, Palmitic Acid, Glycerin, C12-15 Alkyl Benzoate, Sorbitol, Colloidal Oatmeal, Tocopherol, Chrysanthemum Parthenium (Feverfew) Extract, Camellia Sinensis Leaf Extract, Glycyrrhiza Glabra (Licorice) Root Extract, Salix Nigra (Willow) Bark Extract, Bisabolol, Hydrogenated Coconut Acid, Xanthan Gum, Steareth-20, Steareth-21, Myristic Acid, Sodium Isethionate, Phenoxyethanol, Potassium Sorbate, Sodium Benzoate, Leuconostoc/Radish Root Ferment Filtrate, EDTA.
[]


In [163]:
text = df['price'][90]
print(text)

$30.00 


In [167]:
new = re.findall(r'(\d+).\d+',text)
print(new[0])

30


## Cont. 

In [179]:
# price

# Regex 

# \d+ one or more digit 
# capturing group (...)

pattern = r"(\d+).\d+"

for i in range(len(df)):
    content = df['price'][i]
    new = re.findall(pattern, content)
    print(new)
    df['price'][i] = new[0]
    
# only taking the whole number price 

# convert argument to numeric 
# df['price'] = pd.to_numeric(df['price'])

['28']
['42']


KeyError: 2

In [180]:
# skin_type

pattern = re.compile(r"([a-zA-Z]+)\\n")

for i in range(len(df)):
    df['skin_type'][i] = re.findall(pattern, df['skin_type'][i])

KeyError: 2

In [61]:
## list column dummies

dum = df['skin_type'].str.join('|').str.get_dummies() 
# dum = cosm['skin_type'].str.join('|').str.get_dummies()

# str.join gives Normal|Oily|Combination|Dry
# separate at | 

# separate each string in the caller series at passed separator 
# df returned with all the possible values after splitting every string
# If the text value in original data frame at same index contains the string (Column name/ Splitted values) 
# then the value at that position is 1 otherwise, 0.

dum.sample(n=20)

Unnamed: 0,Combination,Dry,Full,Light,Matte,Medium,Natural,Normal,Oily,Radiant,Sensitive
1117,1,1,0,0,0,0,0,1,1,0,1
476,1,1,0,0,0,0,0,1,1,0,1
901,1,1,0,0,0,0,0,1,1,0,0
808,0,0,0,0,0,0,0,0,0,0,0
679,1,1,0,0,0,0,0,1,1,0,1
746,0,0,0,0,0,0,0,0,0,0,0
1419,1,1,0,0,0,0,0,1,1,0,1
1443,0,0,0,0,0,0,0,0,0,0,0
240,1,1,0,0,0,0,0,1,1,0,1
712,1,1,0,0,0,0,0,1,1,0,1


In [62]:
df3 = df.join(dum).drop('skin_type', axis=1)
# cosm3 = cosm.join(dum).drop('skin_type', axis = 1)

df3.sample(n=10)

Unnamed: 0,Label,brand,name,price,rank,ingredients,Combination,Dry,Full,Light,Matte,Medium,Natural,Normal,Oily,Radiant,Sensitive
181,Moisturizer,DR. DENNIS GROSS SKINCARE,Ferulic + Retinol Wrinkle Recovery Overnight S...,88,4.4,-Ferulic Acid (Plant-based Antioxidant): Enhan...,1,1,0,0,0,0,0,1,1,0,1
858,Mask,ORIGINS,"Hello, Calm Relaxing & Hydrating Face Mask wit...",28,4.8,-Cannabis Sativa Seed Oil: Nourishes with esse...,1,1,0,0,0,0,0,1,1,0,0
1319,SPF,CLARINS,Sunscreen Multi-Protection Broad Spectrum SPF 50,43,3.9,-Exclusive Organic Alpine Sanicle Extract: Hel...,0,0,0,0,0,0,0,0,0,0,0
24,Moisturizer,BELIF,The True Cream Moisturizing Bomb,38,4.6,"-Comfrey Leaf: Detoxifies skin, leaving it smo...",0,1,0,0,0,0,0,1,0,0,0
229,Moisturizer,KATE SOMERVILLE,Wrinkle Warrior™ 2-in-1 Plumping Moisturizer +...,95,3.9,-Hyaluronic Acid: Hydrates and reduces the app...,1,1,0,0,0,0,0,1,1,0,0
583,Treatment,LANCÔME,Advanced Génifique Youth Activating Serum Duo,178,2.0,"Water, Bifida Ferment Lysate, Glycerin, Alcoho...",0,0,0,0,0,0,0,0,0,0,0
650,Treatment,IT COSMETICS,Bye Bye Breakout™ Full-Coverage Concealer,28,3.9,-Tea Tree: Helps treat blemishes and acne-pron...,1,1,0,0,0,0,0,1,1,0,1
456,Cleanser,LANCER,The Method: Polish Blemish Control,75,4.7,"Water, Sodium C14-16 Olefin Sulfonate, Acrylat...",1,0,0,0,0,0,0,1,1,0,0
74,Moisturizer,DR. JART+,Ceramidin™ Cream,48,4.6,-5-Cera Complex: Thoroughly moisturizes and st...,1,1,0,0,0,0,0,1,0,0,1
288,Moisturizer,LANCÔME,ABSOLUE PREMIUM ßx - Absolute Replenishing Lot...,185,4.1,No Info,0,0,0,0,0,0,0,0,0,0,0


In [63]:
## tokenize ingredients

a = [t.split('\r\n\r\n') for t in df['ingredients']]
pattern = ['\r\n', '-\w+: ', 'Please', 'No Info', 'This product', 'Visit']

for i in range(len(df)):
    num = len(a[i]) # a is each entry of ingredients in a list 
    for j in range(num):
        if all(x not in a[i][j] for x in pattern):
            df3['ingredients'][i] = a[i][j]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':


In [64]:
# save the file

df3.to_csv('body-clean.csv', encoding = 'utf-8-sig', index = False)