# Data Cleaning

This notebook documents the data cleaning process applied to the raw product information scraped from Yesstyle.com.

In [1]:
import pandas as pd
import re

In [2]:
#read the scraped product table from the raw CSV and preview it
yesstyle = pd.read_csv(filepath_or_buffer='Data/raw_yesstyle.csv')
yesstyle

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,price,name,ingredients,rating,reviews,label
0,0,0,US$ 10.08,B.LAB - Matcha Hydrating Foam Cleanser,"Water, Glycerin, Sodium Cocoyl Glycinate, Sodi...",4.5,762,cleanser
1,1,1,US$ 16.59,iUNIK - Calendula Complete Cleansing Oil 200ml,"Helianthus Annuus (Sunflower) Seed Oil, Canola...",4.6,1691,cleanser
2,2,2,US$ 9.99,THE FACE SHOP - Rice Water Bright Light Cleans...,"Isopropyl Myristate, Caprylic/Capric Triglycer...",4.4,3967,cleanser
3,3,3,US$ 14.90,SOME BY MI - Pure Vitamin C V10 Cleansing Bar 1pc,"Glycerin, Water, Propylene Glycol, Sodium Palm...",4.6,969,cleanser
4,4,4,US$ 9.36,ISEHAN - Kiss Me Heroine Make Speedy Mascara R...,"Isohexadecane, Triethylhexanoin, Isododecane, ...",4.8,615,cleanser
...,...,...,...,...,...,...,...,...
2287,2287,27,US$ 47.10,RAFRA - UV Milk SPF 50+ PA++++,"ater, ethylhexyl methoxycinnamate, DPG, glycer...",no rating,{{::lotinfoitem.product.reviewRatingCount}},spf
2288,2288,28,US$ 57.00,Paul & Joe - Suntan Body Primer SPF 6,"Water, BG, glycerin, ethylhexyl methoxycinnama...",no rating,{{::lotinfoitem.product.reviewRatingCount}},spf
2289,2289,29,US$ 57.00,Paul & Joe - Body Primer Super UV SPF 50+ PA++++,Cyclomethicone / water / ethanol / zinc oxide ...,no rating,{{::lotinfoitem.product.reviewRatingCount}},spf
2290,2290,30,US$ 57.00,Paul & Joe - Sunprotection Body Primer Shimmer...,"Water, BG, ethylhexyl methoxycinnamate, mica, ...",no rating,{{::lotinfoitem.product.reviewRatingCount}},spf


In [5]:
#drop the two unnamed columns (they appear to hold row counters, not product data)
yesstyle = yesstyle.drop(columns=['Unnamed: 0', 'Unnamed: 0.1'])

In [6]:
#separate 'name' column into 'brand' and 'product'
#n=1 splits on the FIRST ' - ' only: everything before it is the brand and
#everything after it is the product, so a product name that itself contains
#' - ' is kept whole instead of being cut off at its second separator
yesstyle[['brand', 'product']] = yesstyle['name'].str.split(' - ', n=1, expand=True)
yesstyle

Unnamed: 0,price,name,ingredients,rating,reviews,label,brand,product
0,US$ 10.08,B.LAB - Matcha Hydrating Foam Cleanser,"Water, Glycerin, Sodium Cocoyl Glycinate, Sodi...",4.5,762,cleanser,B.LAB,Matcha Hydrating Foam Cleanser
1,US$ 16.59,iUNIK - Calendula Complete Cleansing Oil 200ml,"Helianthus Annuus (Sunflower) Seed Oil, Canola...",4.6,1691,cleanser,iUNIK,Calendula Complete Cleansing Oil 200ml
2,US$ 9.99,THE FACE SHOP - Rice Water Bright Light Cleans...,"Isopropyl Myristate, Caprylic/Capric Triglycer...",4.4,3967,cleanser,THE FACE SHOP,Rice Water Bright Light Cleansing Oil 150ml
3,US$ 14.90,SOME BY MI - Pure Vitamin C V10 Cleansing Bar 1pc,"Glycerin, Water, Propylene Glycol, Sodium Palm...",4.6,969,cleanser,SOME BY MI,Pure Vitamin C V10 Cleansing Bar 1pc
4,US$ 9.36,ISEHAN - Kiss Me Heroine Make Speedy Mascara R...,"Isohexadecane, Triethylhexanoin, Isododecane, ...",4.8,615,cleanser,ISEHAN,Kiss Me Heroine Make Speedy Mascara Remover
...,...,...,...,...,...,...,...,...
2287,US$ 47.10,RAFRA - UV Milk SPF 50+ PA++++,"ater, ethylhexyl methoxycinnamate, DPG, glycer...",no rating,{{::lotinfoitem.product.reviewRatingCount}},spf,RAFRA,UV Milk SPF 50+ PA++++
2288,US$ 57.00,Paul & Joe - Suntan Body Primer SPF 6,"Water, BG, glycerin, ethylhexyl methoxycinnama...",no rating,{{::lotinfoitem.product.reviewRatingCount}},spf,Paul & Joe,Suntan Body Primer SPF 6
2289,US$ 57.00,Paul & Joe - Body Primer Super UV SPF 50+ PA++++,Cyclomethicone / water / ethanol / zinc oxide ...,no rating,{{::lotinfoitem.product.reviewRatingCount}},spf,Paul & Joe,Body Primer Super UV SPF 50+ PA++++
2290,US$ 57.00,Paul & Joe - Sunprotection Body Primer Shimmer...,"Water, BG, ethylhexyl methoxycinnamate, mica, ...",no rating,{{::lotinfoitem.product.reviewRatingCount}},spf,Paul & Joe,Sunprotection Body Primer Shimmer SPF 30 PA+++


In [7]:
#keep only the text that follows the 'US$' currency marker in each price
#(the space after the marker is left in place; the value is still a string here)
yesstyle['price'] = [raw_price.split('US$')[1] for raw_price in yesstyle['price']]
yesstyle

Unnamed: 0,price,name,ingredients,rating,reviews,label,brand,product
0,10.08,B.LAB - Matcha Hydrating Foam Cleanser,"Water, Glycerin, Sodium Cocoyl Glycinate, Sodi...",4.5,762,cleanser,B.LAB,Matcha Hydrating Foam Cleanser
1,16.59,iUNIK - Calendula Complete Cleansing Oil 200ml,"Helianthus Annuus (Sunflower) Seed Oil, Canola...",4.6,1691,cleanser,iUNIK,Calendula Complete Cleansing Oil 200ml
2,9.99,THE FACE SHOP - Rice Water Bright Light Cleans...,"Isopropyl Myristate, Caprylic/Capric Triglycer...",4.4,3967,cleanser,THE FACE SHOP,Rice Water Bright Light Cleansing Oil 150ml
3,14.90,SOME BY MI - Pure Vitamin C V10 Cleansing Bar 1pc,"Glycerin, Water, Propylene Glycol, Sodium Palm...",4.6,969,cleanser,SOME BY MI,Pure Vitamin C V10 Cleansing Bar 1pc
4,9.36,ISEHAN - Kiss Me Heroine Make Speedy Mascara R...,"Isohexadecane, Triethylhexanoin, Isododecane, ...",4.8,615,cleanser,ISEHAN,Kiss Me Heroine Make Speedy Mascara Remover
...,...,...,...,...,...,...,...,...
2287,47.10,RAFRA - UV Milk SPF 50+ PA++++,"ater, ethylhexyl methoxycinnamate, DPG, glycer...",no rating,{{::lotinfoitem.product.reviewRatingCount}},spf,RAFRA,UV Milk SPF 50+ PA++++
2288,57.00,Paul & Joe - Suntan Body Primer SPF 6,"Water, BG, glycerin, ethylhexyl methoxycinnama...",no rating,{{::lotinfoitem.product.reviewRatingCount}},spf,Paul & Joe,Suntan Body Primer SPF 6
2289,57.00,Paul & Joe - Body Primer Super UV SPF 50+ PA++++,Cyclomethicone / water / ethanol / zinc oxide ...,no rating,{{::lotinfoitem.product.reviewRatingCount}},spf,Paul & Joe,Body Primer Super UV SPF 50+ PA++++
2290,57.00,Paul & Joe - Sunprotection Body Primer Shimmer...,"Water, BG, ethylhexyl methoxycinnamate, mica, ...",no rating,{{::lotinfoitem.product.reviewRatingCount}},spf,Paul & Joe,Sunprotection Body Primer Shimmer SPF 30 PA+++


In [8]:
#cast the 'price' column from text to float
yesstyle = yesstyle.astype({'price': float})

I decided to drop rows with **no ratings**. I noticed the less popular products tended to not have any ratings so it's almost like a feedback loop. Worse products will have little to no people reviewing them, making them less known.

In [9]:
#keep only the rows whose rating is not the 'no rating' placeholder
#(despite its name, the mask below is True for rows that DO have a rating)
no_ratings = yesstyle['rating'].ne('no rating')
yesstyle = yesstyle.loc[no_ratings].copy()
yesstyle

Unnamed: 0,price,name,ingredients,rating,reviews,label,brand,product
0,10.08,B.LAB - Matcha Hydrating Foam Cleanser,"Water, Glycerin, Sodium Cocoyl Glycinate, Sodi...",4.5,762,cleanser,B.LAB,Matcha Hydrating Foam Cleanser
1,16.59,iUNIK - Calendula Complete Cleansing Oil 200ml,"Helianthus Annuus (Sunflower) Seed Oil, Canola...",4.6,1691,cleanser,iUNIK,Calendula Complete Cleansing Oil 200ml
2,9.99,THE FACE SHOP - Rice Water Bright Light Cleans...,"Isopropyl Myristate, Caprylic/Capric Triglycer...",4.4,3967,cleanser,THE FACE SHOP,Rice Water Bright Light Cleansing Oil 150ml
3,14.90,SOME BY MI - Pure Vitamin C V10 Cleansing Bar 1pc,"Glycerin, Water, Propylene Glycol, Sodium Palm...",4.6,969,cleanser,SOME BY MI,Pure Vitamin C V10 Cleansing Bar 1pc
4,9.36,ISEHAN - Kiss Me Heroine Make Speedy Mascara R...,"Isohexadecane, Triethylhexanoin, Isododecane, ...",4.8,615,cleanser,ISEHAN,Kiss Me Heroine Make Speedy Mascara Remover
...,...,...,...,...,...,...,...,...
2225,24.40,IWLT - Protecting Natural Shine Tone Up Sun Cu...,"Water, Cyclopentasiloxane, Zinc Oxide (CI 7794...",5.0,1,spf,IWLT,Protecting Natural Shine Tone Up Sun Cushion R...
2226,26.50,Neogence - Watery Essence Sunscreen SPF 50+ PA...,"Decyl Glucoside, Aqua, Diethylamino Hydroxyben...",3.5,4,spf,Neogence,Watery Essence Sunscreen SPF 50+ PA++++
2233,18.80,DAYCELL - MEDI LAB Black Rose Blossom Daily Su...,"Water, Cyclopentasiloxane, Zinc Oxide, Ethylhe...",4.0,1,spf,DAYCELL,MEDI LAB Black Rose Blossom Daily Sun SPF50+ P...
2237,9.80,Country & Stream - Honey UV Lip Balm HM SPF 20...,"Mineral oil, ceresin, diisostearyl malate, pet...",5.0,1,spf,Country & Stream,Honey UV Lip Balm HM SPF 20 PA++


In [10]:
#remove any commas from 'reviews' (presumably thousands separators), then store the counts as int64
reviews_without_commas = yesstyle['reviews'].str.replace(',', '', regex=False)
yesstyle['reviews'] = reviews_without_commas.astype('int64')

In [11]:
#print the frame's index range, column dtypes, non-null counts and memory usage
yesstyle.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1858 entries, 0 to 2285
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   price        1858 non-null   float64
 1   name         1858 non-null   object 
 2   ingredients  1858 non-null   object 
 3   rating       1858 non-null   object 
 4   reviews      1858 non-null   int64  
 5   label        1858 non-null   object 
 6   brand        1858 non-null   object 
 7   product      1858 non-null   object 
dtypes: float64(1), int64(1), object(6)
memory usage: 130.6+ KB


In [12]:
#look up the ingredients text of the row labelled 1593
yesstyle.loc[1593, 'ingredients']

"*Aloe Barbadensis (Aloe Vera) Leaf Extract (95.1%), Propanediol, Chamaecyparis Obtusa (Hinoki Cypress) Leaf Extract, *Rosmarinus Officinalis (Rosemary) Leaf Water (1, 500mg), Rosa Rugosa Leaf Extract, Sodium Carbomer, Melissa Officinalis (Lemon Balm) Leaf Extract, Maltodextrin\n\nIngredients subject to change at manufacturer's discretion. For the most complete and up-to-date list of ingredients, please refer to product packaging.\n\n\n\n{{productData.majorIngredients.ingredients}}\n\n{{majorIngredientNote}}"

The above is an example of a really **messy** ingredients list. There's plenty of uninformative punctuation, extraneous messages, etc. Let's try to clean and normalize everything.

In [13]:
#keep one row per distinct ingredients text: the one with the highest number of reviews
#step 1: renumber the rows 0..n-1 so that idxmax reports row POSITIONS
rows_by_position = yesstyle.reset_index(drop=True)
#step 2: for every ingredients text, find the position of its most-reviewed row
most_reviewed_positions = rows_by_position.groupby('ingredients')['reviews'].idxmax()
#step 3: select those positions with iloc and restore the original row order
yesstyle = yesstyle.iloc[most_reviewed_positions].sort_index()
yesstyle

Unnamed: 0,price,name,ingredients,rating,reviews,label,brand,product
0,10.08,B.LAB - Matcha Hydrating Foam Cleanser,"Water, Glycerin, Sodium Cocoyl Glycinate, Sodi...",4.5,762,cleanser,B.LAB,Matcha Hydrating Foam Cleanser
1,16.59,iUNIK - Calendula Complete Cleansing Oil 200ml,"Helianthus Annuus (Sunflower) Seed Oil, Canola...",4.6,1691,cleanser,iUNIK,Calendula Complete Cleansing Oil 200ml
2,9.99,THE FACE SHOP - Rice Water Bright Light Cleans...,"Isopropyl Myristate, Caprylic/Capric Triglycer...",4.4,3967,cleanser,THE FACE SHOP,Rice Water Bright Light Cleansing Oil 150ml
3,14.90,SOME BY MI - Pure Vitamin C V10 Cleansing Bar 1pc,"Glycerin, Water, Propylene Glycol, Sodium Palm...",4.6,969,cleanser,SOME BY MI,Pure Vitamin C V10 Cleansing Bar 1pc
4,9.36,ISEHAN - Kiss Me Heroine Make Speedy Mascara R...,"Isohexadecane, Triethylhexanoin, Isododecane, ...",4.8,615,cleanser,ISEHAN,Kiss Me Heroine Make Speedy Mascara Remover
...,...,...,...,...,...,...,...,...
2224,16.70,Nella - White Snow Brightening Fermented Tone-...,"Water, Cyclopentasiloxane, Titanium Dioxide, E...",4.0,4,spf,Nella,White Snow Brightening Fermented Tone-Up Sun C...
2226,26.50,Neogence - Watery Essence Sunscreen SPF 50+ PA...,"Decyl Glucoside, Aqua, Diethylamino Hydroxyben...",3.5,4,spf,Neogence,Watery Essence Sunscreen SPF 50+ PA++++
2233,18.80,DAYCELL - MEDI LAB Black Rose Blossom Daily Su...,"Water, Cyclopentasiloxane, Zinc Oxide, Ethylhe...",4.0,1,spf,DAYCELL,MEDI LAB Black Rose Blossom Daily Sun SPF50+ P...
2237,9.80,Country & Stream - Honey UV Lip Balm HM SPF 20...,"Mineral oil, ceresin, diisostearyl malate, pet...",5.0,1,spf,Country & Stream,Honey UV Lip Balm HM SPF 20 PA++


Oftentimes, there are products that come in **multiple sizes**. Naturally, these products are expected to have the same ingredient formulations because they are literally the same product, albeit coming in a different volume. After removing "duplicate" items, we reduce the size of the dataframe from 1858 to 1661, a difference of 197.

It would be difficult to tokenize ingredients if they include 'Other ingredients' or 'Active ingredient', so I decided to completely remove them.

In [14]:
#drop every row whose ingredients mention 'Other ingredients' or 'Active ingredient' (any letter case)
#na=True counts a missing ingredients value as a match, so such a row is dropped as well
mentions_section_label = yesstyle['ingredients'].str.contains(
    'Other ingredients|Active ingredient', case=False, na=True)
yesstyle = yesstyle.loc[~mentions_section_label].copy()
yesstyle

Unnamed: 0,price,name,ingredients,rating,reviews,label,brand,product
0,10.08,B.LAB - Matcha Hydrating Foam Cleanser,"Water, Glycerin, Sodium Cocoyl Glycinate, Sodi...",4.5,762,cleanser,B.LAB,Matcha Hydrating Foam Cleanser
1,16.59,iUNIK - Calendula Complete Cleansing Oil 200ml,"Helianthus Annuus (Sunflower) Seed Oil, Canola...",4.6,1691,cleanser,iUNIK,Calendula Complete Cleansing Oil 200ml
2,9.99,THE FACE SHOP - Rice Water Bright Light Cleans...,"Isopropyl Myristate, Caprylic/Capric Triglycer...",4.4,3967,cleanser,THE FACE SHOP,Rice Water Bright Light Cleansing Oil 150ml
3,14.90,SOME BY MI - Pure Vitamin C V10 Cleansing Bar 1pc,"Glycerin, Water, Propylene Glycol, Sodium Palm...",4.6,969,cleanser,SOME BY MI,Pure Vitamin C V10 Cleansing Bar 1pc
4,9.36,ISEHAN - Kiss Me Heroine Make Speedy Mascara R...,"Isohexadecane, Triethylhexanoin, Isododecane, ...",4.8,615,cleanser,ISEHAN,Kiss Me Heroine Make Speedy Mascara Remover
...,...,...,...,...,...,...,...,...
2224,16.70,Nella - White Snow Brightening Fermented Tone-...,"Water, Cyclopentasiloxane, Titanium Dioxide, E...",4.0,4,spf,Nella,White Snow Brightening Fermented Tone-Up Sun C...
2226,26.50,Neogence - Watery Essence Sunscreen SPF 50+ PA...,"Decyl Glucoside, Aqua, Diethylamino Hydroxyben...",3.5,4,spf,Neogence,Watery Essence Sunscreen SPF 50+ PA++++
2233,18.80,DAYCELL - MEDI LAB Black Rose Blossom Daily Su...,"Water, Cyclopentasiloxane, Zinc Oxide, Ethylhe...",4.0,1,spf,DAYCELL,MEDI LAB Black Rose Blossom Daily Sun SPF50+ P...
2237,9.80,Country & Stream - Honey UV Lip Balm HM SPF 20...,"Mineral oil, ceresin, diisostearyl malate, pet...",5.0,1,spf,Country & Stream,Honey UV Lip Balm HM SPF 20 PA++


There were some products that were sets, meaning that they included multiple products, most likely products that were already in the dataframe, so I decided to remove all sets.

In [15]:
#remove rows that include things such as '[skin softener]', which indicate sets of products
#the pattern is a raw string: '\[' is not a valid escape in a normal string literal,
#so the non-raw form makes Python emit an invalid-escape SyntaxWarning
#na=True counts a missing ingredients value as a match, so such a row is dropped as well
has_bracketed_label = yesstyle['ingredients'].str.contains(r'\[.*\]', regex=True, na=True)
yesstyle = yesstyle[~has_bracketed_label].copy()
yesstyle

  yesstyle = yesstyle[yesstyle['ingredients'].str.contains('\[.*\]', regex=True)==False].copy()


Unnamed: 0,price,name,ingredients,rating,reviews,label,brand,product
0,10.08,B.LAB - Matcha Hydrating Foam Cleanser,"Water, Glycerin, Sodium Cocoyl Glycinate, Sodi...",4.5,762,cleanser,B.LAB,Matcha Hydrating Foam Cleanser
1,16.59,iUNIK - Calendula Complete Cleansing Oil 200ml,"Helianthus Annuus (Sunflower) Seed Oil, Canola...",4.6,1691,cleanser,iUNIK,Calendula Complete Cleansing Oil 200ml
2,9.99,THE FACE SHOP - Rice Water Bright Light Cleans...,"Isopropyl Myristate, Caprylic/Capric Triglycer...",4.4,3967,cleanser,THE FACE SHOP,Rice Water Bright Light Cleansing Oil 150ml
3,14.90,SOME BY MI - Pure Vitamin C V10 Cleansing Bar 1pc,"Glycerin, Water, Propylene Glycol, Sodium Palm...",4.6,969,cleanser,SOME BY MI,Pure Vitamin C V10 Cleansing Bar 1pc
4,9.36,ISEHAN - Kiss Me Heroine Make Speedy Mascara R...,"Isohexadecane, Triethylhexanoin, Isododecane, ...",4.8,615,cleanser,ISEHAN,Kiss Me Heroine Make Speedy Mascara Remover
...,...,...,...,...,...,...,...,...
2224,16.70,Nella - White Snow Brightening Fermented Tone-...,"Water, Cyclopentasiloxane, Titanium Dioxide, E...",4.0,4,spf,Nella,White Snow Brightening Fermented Tone-Up Sun C...
2226,26.50,Neogence - Watery Essence Sunscreen SPF 50+ PA...,"Decyl Glucoside, Aqua, Diethylamino Hydroxyben...",3.5,4,spf,Neogence,Watery Essence Sunscreen SPF 50+ PA++++
2233,18.80,DAYCELL - MEDI LAB Black Rose Blossom Daily Su...,"Water, Cyclopentasiloxane, Zinc Oxide, Ethylhe...",4.0,1,spf,DAYCELL,MEDI LAB Black Rose Blossom Daily Sun SPF50+ P...
2237,9.80,Country & Stream - Honey UV Lip Balm HM SPF 20...,"Mineral oil, ceresin, diisostearyl malate, pet...",5.0,1,spf,Country & Stream,Honey UV Lip Balm HM SPF 20 PA++


In [16]:
#keep only the first line of each 'ingredients' entry; everything from the first newline onwards is discarded
yesstyle['ingredients'] = yesstyle['ingredients'].str.partition('\n')[0]

A lot of ingredients will have additional information in **parentheses**, such as (42%) or (568ppm) or (Licorice). It's best to remove this information from each ingredient list in order to standardize ingredients.

For example, one product might have 'Glycyrrhiza Glabra (Licorice) Root Extract' and another might have 'Glycyrrhiza Glabra Root Extract'. Although these ingredients are exactly the same, they would not be interpreted as identical ingredients when tokenizing them.

In [17]:
#removing anything enclosed in parentheses, e.g. '(42%)' or '(Licorice)'
#raw string so that '\(' and '\)' reach the regex engine without an invalid-escape SyntaxWarning
#the lazy '.*?' stops at the first ')' so each parenthesised group is removed on its own
yesstyle['ingredients'] = yesstyle['ingredients'].str.replace(r'\(.*?\)', '', regex=True)

  yesstyle['ingredients'] = yesstyle['ingredients'].str.replace('\(.*?\)', '', regex=True)


In [18]:
#remove asterisk signs from ingredients
#a literal (non-regex) replace removes every '*' and needs no escaping,
#which avoids the invalid-escape SyntaxWarning caused by '\*' in a normal string
yesstyle['ingredients'] = yesstyle['ingredients'].str.replace('*', '', regex=False)

  yesstyle['ingredients'] = yesstyle['ingredients'].str.replace('[\*]+', '', regex=True)


In [19]:
#replace 2 or more consecutive whitespace characters with a single space
#raw string so that '\s' reaches the regex engine without an invalid-escape SyntaxWarning
yesstyle['ingredients'] = yesstyle['ingredients'].str.replace(r'\s{2,}', ' ', regex=True)


  yesstyle['ingredients'] = yesstyle['ingredients'].str.replace('[\s]{2,}', ' ', regex=True)


In [20]:
#remove the whitespace character that sits directly before a comma ('Extract ,' -> 'Extract,')
#raw string so that '\s' reaches the regex engine without an invalid-escape SyntaxWarning
yesstyle['ingredients'] = yesstyle['ingredients'].str.replace(r'\s,', ',', regex=True)

  yesstyle['ingredients'] = yesstyle['ingredients'].str.replace('\s,', ',', regex=True)


In [21]:
#trim whitespace from both ends of every 'ingredients' string (each ingredient list is one string)
yesstyle = yesstyle.assign(ingredients=yesstyle['ingredients'].str.strip())

In [22]:
#show the cleaned ingredients text of the row at position 382
yesstyle['ingredients'].iloc[382]

'Water, Betaine, Glycerin, Propanediol, Oryza Sativa Extract, Phyllostachys Pubescens Shoot Bark Extract, Aspergillus Ferment Extract Filtrate, Panax Ginseng Root Extract, Cyclodextrin, Scutellaria Baicalensis Root Extract, Hyaluronic Acid, Beta-Glucan, Cellulose Gum, Xanthan Gum, Butylene Glycol, Usnea Barbata Extract, Zanthoxylum Piperitum Fruit Extract, Pulsatilla Koreana Extract, Sodium Phytate, Tamarindus Indica Seed Gum, Polyglyceryl-10 Laurate, Polyglyceryl-10 Myristate, Glucose, 1,2-Hexanediol, Ethanol, Lavandula Angustifolia Oil, Linalool'

And voila, the above is an ideal example of a clean ingredients list. 

In [25]:
#renumber the rows 0..n-1 (the old row labels are discarded) and show the last five rows
yesstyle = yesstyle.set_axis(pd.RangeIndex(len(yesstyle)), axis='index')
yesstyle.tail(5)

Unnamed: 0,price,name,ingredients,rating,reviews,label,brand,product
1569,16.7,Nella - White Snow Brightening Fermented Tone-...,"Water, Cyclopentasiloxane, Titanium Dioxide, E...",4.0,4,spf,Nella,White Snow Brightening Fermented Tone-Up Sun C...
1570,26.5,Neogence - Watery Essence Sunscreen SPF 50+ PA...,"Decyl Glucoside, Aqua, Diethylamino Hydroxyben...",3.5,4,spf,Neogence,Watery Essence Sunscreen SPF 50+ PA++++
1571,18.8,DAYCELL - MEDI LAB Black Rose Blossom Daily Su...,"Water, Cyclopentasiloxane, Zinc Oxide, Ethylhe...",4.0,1,spf,DAYCELL,MEDI LAB Black Rose Blossom Daily Sun SPF50+ P...
1572,9.8,Country & Stream - Honey UV Lip Balm HM SPF 20...,"Mineral oil, ceresin, diisostearyl malate, pet...",5.0,1,spf,Country & Stream,Honey UV Lip Balm HM SPF 20 PA++
1573,15.7,KTW - Aqua Daily Sun Cream,"Betula Platyphylla Japonica Juice, Water, Buty...",2.0,1,spf,KTW,Aqua Daily Sun Cream


Let's save the cleaned dataset as another file so that we can use that for the visualization and recommendation tool!

In [26]:
#save cleaned dataframe as csv
#index=True is the default: the row index is written as the first column, with an empty header
yesstyle.to_csv('cleaned_yesstyle.csv', index=True)