## Import libraries

In [86]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt

## Clean data

In [176]:
df = pd.read_csv('body-care-1.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 666 entries, 0 to 665
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Label        666 non-null    object
 1   URL          666 non-null    object
 2   brand        666 non-null    object
 3   name         666 non-null    object
 4   price        666 non-null    object
 5   skin_type    666 non-null    object
 6   ingredients  661 non-null    object
dtypes: object(7)
memory usage: 36.5+ KB


In [177]:
df.head(5)

Unnamed: 0,Label,URL,brand,name,price,skin_type,ingredients
0,bath-and-body-soap,https://www.sephora.com/product/kp-bump-eraser...,FIRST AID BEAUTY,KP Bump Eraser Body Scrub with 10% AHA,$28.00,"['Normal,', 'Dry,', 'Combination\n']",-Pumice Buffing Beads: Exfoliate particles to ...
1,bath-and-body-soap,https://www.sephora.com/product/sol-de-janeiro...,SOL DE JANEIRO,Bum Bum Body Scrub,$42.00,"['Normal,', 'Dry,', 'Combination,', 'Oily\n']",-Crushed Cupuaçu Seeds: Gently buff and smooth...
2,bath-and-body-soap,https://www.sephora.com/product/brazilian-4-pl...,SOL DE JANEIRO,Brazilian 4 Play Moisturizing Shower Cream-Gel,$25.00,"['Dry,', 'Normal,', 'Combination,', 'Oily,']",
3,bath-and-body-soap,https://www.sephora.com/product/sol-de-janeiro...,SOL DE JANEIRO,Brazilian Touch Hand Sanitizer Spray,$10.00,[],"Alcohol 80%, Aqua (water), glycerine, hydrogen..."
4,bath-and-body-soap,https://www.sephora.com/product/necessaire-the...,NÉCESSAIRE,The Body Wash,$25.00,"['Normal,', 'Dry,', 'Combination,', 'Oily\n']","-Niacinamide (Vitamin B3): Cleanses, nourishes..."


In [178]:
# label
# ['bath-and-body-soap', 'body-moisturizers', 'sun-lotion', 'body-care', 'beauty-supplements-bath-body']

df.Label[df['Label'] == 'bath-and-body-soap'] = str('soap')
df.Label[df['Label'] == 'body-moisturizers'] = str('moisturizers')
df.Label[df['Label'] == 'sun-lotion'] = str('SPF')
df.Label[df['Label'] == 'body-care'] = str('body')
df.Label[df['Label'] == 'beauty-supplements-bath-body'] = str('supplements')

In [179]:
df['Label'].value_counts()

moisturizers    204
SPF             180
soap            119
supplements      98
body             65
Name: Label, dtype: int64

In [180]:
# remove duplicated items
df['name'].drop_duplicates()

# .loc accesses a group of rows and columns by a label or a boolean array
# reset_index() resets the index from 0 to length of data 

0              KP Bump Eraser Body Scrub with 10% AHA
1                                  Bum Bum Body Scrub
2      Brazilian 4 Play Moisturizing Shower Cream-Gel
3                Brazilian Touch Hand Sanitizer Spray
4                                       The Body Wash
                            ...                      
659          Sleep Welle Fortified Calming Tea Refill
661                  Beauty Collagen - Lavender Lemon
663               Beauty Collagen - Tropical Hibiscus
664                                      Night Beauty
665                             Madame Ovary Vitamins
Name: name, Length: 643, dtype: object

In [181]:
# URL
df.drop(['URL'], axis = 1, inplace = True)

In [182]:
df = df.loc[pd.notnull(df['ingredients'])]
# removed null ingredient items 

df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 661 entries, 0 to 665
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Label        661 non-null    object
 1   brand        661 non-null    object
 2   name         661 non-null    object
 3   price        661 non-null    object
 4   skin_type    661 non-null    object
 5   ingredients  661 non-null    object
dtypes: object(6)
memory usage: 36.1+ KB


In [183]:
df = df.reset_index()

## Test block

## Cont. 

In [184]:
# price

# Regex 

# \d+ one or more digit 
# capturing group (...)

pattern = r"(\d+).\d+"

for i in range(len(df)):
    content = df['price'][i]
    new = re.findall(pattern, content)
    df['price'][i] = new
# only taking the whole number price 

# convert argument to numeric 
# df['price'] = pd.to_numeric(df['price'])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]


In [185]:
# skin_type

pattern = re.compile(r"([a-zA-Z]+)\\n")

for i in range(len(df)):
    df['skin_type'][i] = re.findall(pattern, df['skin_type'][i])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [186]:
for i in df['skin_type']:
    i = np.array(i)
    i = i.tolist()
    for j in i: 
        j = j.strip(',\n')

In [187]:
df['skin_type'].sample(10)

240                     []
378                 [Oily]
591    [Oily, Combination]
210                 [Oily]
23                      []
617                     []
385                  [Dry]
576                     []
97                      []
453          [Combination]
Name: skin_type, dtype: object

In [189]:
## list column dummies

dum = df['skin_type'].str.join('|').str.get_dummies() 
# dum = cosm['skin_type'].str.join('|').str.get_dummies()

# str.join gives Normal|Oily|Combination|Dry
# separate at | 

# separate each string in the caller series at passed separator 
# df returned with all the possible values after splitting every string
# If the text value in original data frame at same index contains the string (Column name/ Splitted values) 
# then the value at that position is 1 otherwise, 0.

dum.sample(n=10)

Unnamed: 0,Combination,Dry,Normal,Oily
78,1,1,1,1
352,0,0,0,1
633,0,0,0,0
67,1,1,1,1
65,0,0,0,0
542,0,0,0,0
560,0,0,0,0
150,1,1,1,1
475,0,0,0,0
229,0,0,0,0


In [190]:
df3 = df.join(dum).drop('skin_type', axis=1)
# cosm3 = cosm.join(dum).drop('skin_type', axis = 1)

df3.sample(n=10)

Unnamed: 0,index,Label,brand,name,price,ingredients,Combination,Dry,Normal,Oily
586,591,supplements,SEPHORA COLLECTION,Sephora Collection x OLLY: Flawless Complexion,[15],-Minerals: Zinc is essential for the generatio...,0,0,0,0
212,215,moisturizers,FRESH,Fresh Life Body Lotion,[26],-Shea Butter: Moisturizes.\n-Vitamins C and E:...,0,0,0,0
377,381,SPF,MURAD,Oil and Pore Control Mattifier Broad Spectrum ...,[42],"-Oil-Trapping Microspheres: Deliver a smooth, ...",0,0,0,1
370,374,SPF,LANCÔME,Rénergie Lift Multi-Action Ultra Dark Spot Cor...,[109],"-Avobenzone (3%), Octisalate (5%), and Octocry...",0,0,0,1
516,520,body,NUFACE,NuBODY Skin Toning Device,[399],Hydrating Leave-On Gel Primer:\n-Hyaluronic ac...,0,0,0,1
257,260,moisturizers,REN CLEAN SKINCARE,Moroccan Rose Otto Body Lotion,[46],"-Rose Otto Oil Moroccan Rose: Hydrates, reduce...",0,0,0,0
643,648,supplements,MOON JUICE,Cosmic Gold,[40],-Organic Turmeric: Supports healthy inflammato...,0,0,0,0
38,39,soap,L'OCCITANE,Shea Butter Extra Gentle Soap,[12],L'Occitane's famously luxe products are nothin...,0,0,0,0
653,658,supplements,SEPHORA COLLECTION,Sephora Collection x OLLY: Radiant Sleep Trave...,[7],-Melatonin: Works with your body’s chemistry t...,0,0,0,0
598,603,supplements,HUM NUTRITION,Big Chill™ Stress Management Supplement,[20],"Free of artificial colors, flavors, and preser...",0,0,0,0


In [191]:
## tokenize ingredients

a = [t.split('\r\n\r\n') for t in df['ingredients']]
pattern = ['\r\n', '-\w+: ', 'Please', 'No Info', 'This product', 'Visit']

for i in range(len(df)):
    num = len(a[i]) # a is each entry of ingredients in a list 
    for j in range(num):
        if all(x not in a[i][j] for x in pattern):
            df3['ingredients'][i] = a[i][j]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.


In [192]:
# save the file

df3.to_csv('body-clean.csv', encoding = 'utf-8-sig', index = False)

In [196]:
df3['ingredients'].sample(n=1)

521    Alcohol Denat., Water, Propylene Glycol, Sodiu...
Name: ingredients, dtype: object