# Data Cleaning  

## Load and Present Data

In [1]:
import pandas as pd


In [2]:
# Read the raw cosmetics table and print its schema (column names, non-null
# counts, dtypes) so the cleaning steps below have a baseline to compare with.
raw_csv_path = '../data/raw/cosmetics.csv'
df = pd.read_csv(raw_csv_path)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1472 entries, 0 to 1471
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Label        1472 non-null   object 
 1   Brand        1472 non-null   object 
 2   Name         1472 non-null   object 
 3   Price        1472 non-null   int64  
 4   Rank         1472 non-null   float64
 5   Ingredients  1472 non-null   object 
 6   Combination  1472 non-null   int64  
 7   Dry          1472 non-null   int64  
 8   Normal       1472 non-null   int64  
 9   Oily         1472 non-null   int64  
 10  Sensitive    1472 non-null   int64  
dtypes: float64(1), int64(6), object(4)
memory usage: 126.6+ KB


## Handling Missing Values

In [3]:
# Tally the null entries in every column; a column of zeros means there is
# nothing to impute or drop.
null_counts = df.isna().sum()
print(null_counts)

Label          0
Brand          0
Name           0
Price          0
Rank           0
Ingredients    0
Combination    0
Dry            0
Normal         0
Oily           0
Sensitive      0
dtype: int64


## Removing Duplicates 

In [4]:
# Number of rows that are exact copies of an earlier row.
duplicate_row_count = df.duplicated().sum()
print(duplicate_row_count)

0


## Irrelevant/Erroneous Rows and Columns

In [5]:
import re

# Metadata columns that are dropped before the ingredient / skin-type cleaning.
irrelevent_columns = ["Label", "Brand", "Name", "Price", "Rank"]
# Binary skin-type flag columns.
skin_type_columns = ['Combination', 'Dry', 'Normal', 'Oily', 'Sensitive']

# Rebind instead of dropping in place: with errors="ignore" the cell can be
# re-run without raising KeyError once the columns are already gone.
df = df.drop(columns=irrelevent_columns, errors="ignore")

# Remove parenthesised annotations together with the whitespace before them.
df['Ingredients'] = df['Ingredients'].str.replace(r"\s*\([^)]*\)", "", regex=True)

# Despite its name, this pattern flags entries that are NOT a plain ingredient
# list: one or more characters outside letters, digits, "()", ",", "-" and
# whitespace, directly followed by a colon.
expected_format = r"[^A-Za-z0-9(),\-\s]+[:].*"

ingredients = df['Ingredients']
# A separate str.match(expected_format) test is unnecessary: anything matching
# at the start of the string is already caught by str.contains.
is_malformed = (
    ingredients.str.contains(expected_format, na=False)
    | ingredients.str.contains(r"#NAME\?|Visit|\*", na=False)
    | ingredients.str.startswith('-', na=False)
)

# A row whose skin-type flags sum to 0 is suitable for no skin type at all,
# which is treated as a data-entry error.
has_skin_type = df[skin_type_columns].sum(axis=1) > 0

# Both masks are built from the same frame, so no index alignment is needed.
# .copy() gives later cells an independent frame they can assign columns to
# without pandas raising SettingWithCopyWarning.
df_cleaned = df.loc[~is_malformed & has_skin_type].copy()

df_cleaned


Unnamed: 0,Ingredients,Combination,Dry,Normal,Oily,Sensitive
0,"Algae Extract, Mineral Oil, Petrolatum, Glycer...",1,1,1,1,1
1,"Galactomyces Ferment Filtrate, Butylene Glycol...",1,1,1,1,1
2,"Water, Dicaprylyl Carbonate, Glycerin, Ceteary...",1,1,1,1,0
3,"Algae Extract, Cyclopentasiloxane, Petrolatum,...",1,1,1,1,1
4,"Water, Snail Secretion Filtrate, Phenyl Trimet...",1,1,1,1,1
...,...,...,...,...,...,...
1457,"SD Alcohol 40, Butyloctyl Salicylate, Polyeste...",1,1,1,1,1
1458,"Alcohol, Algae Extract, Aloe Barbadensis Leaf ...",1,1,1,1,1
1459,"Water, Caprylic/Caprlc Triglyceride, Glycerin,...",1,1,1,1,0
1464,"Water, Cyclopentasiloxane, Butyloctyl Salicyla...",1,1,1,1,1


## Standardisation of Ingredients 

In [6]:
# Standardise the ingredient text:
#   1. lowercase everything so the same ingredient is spelled one way;
#   2. strip parenthesised annotations (and the whitespace before them).
#
# The substitution has to be assigned back to the column: re.sub / str.replace
# return new strings and never modify the Series in place. The vectorised
# .str.replace also avoids a Python-level loop over the rows.
#
# assign() returns a new frame, so nothing is written into a filtered slice of
# another DataFrame (no SettingWithCopyWarning).
df_cleaned = df_cleaned.assign(
    Ingredients=(
        df_cleaned['Ingredients']
        .str.lower()
        .str.replace(r"\s*\(.*?\)", "", regex=True)
    )
)
df_cleaned

Unnamed: 0,Ingredients,Combination,Dry,Normal,Oily,Sensitive
0,"algae extract, mineral oil, petrolatum, glycer...",1,1,1,1,1
1,"galactomyces ferment filtrate, butylene glycol...",1,1,1,1,1
2,"water, dicaprylyl carbonate, glycerin, ceteary...",1,1,1,1,0
3,"algae extract, cyclopentasiloxane, petrolatum,...",1,1,1,1,1
4,"water, snail secretion filtrate, phenyl trimet...",1,1,1,1,1
...,...,...,...,...,...,...
1457,"sd alcohol 40, butyloctyl salicylate, polyeste...",1,1,1,1,1
1458,"alcohol, algae extract, aloe barbadensis leaf ...",1,1,1,1,1
1459,"water, caprylic/caprlc triglyceride, glycerin,...",1,1,1,1,0
1464,"water, cyclopentasiloxane, butyloctyl salicyla...",1,1,1,1,1


## Feature Engineering 

In [7]:
# tokenisation
# natural language toolkit
import nltk
from nltk import word_tokenize

# Only contact the download server when the tokenizer data is not installed
# locally; this keeps offline / restricted-network runs free of download errors.
try:
    nltk.data.find('tokenizers/punkt_tab')
except LookupError:
    nltk.download('punkt_tab')


def _tokenise_ingredients(value):
    """Split a comma-separated ingredient string into a list of word-token lists.

    Each comma-separated part is stripped and passed to ``word_tokenize``, so
    ``"mineral oil, water"`` becomes ``[["mineral", "oil"], ["water"]]``.
    Values that are not strings (for example a row that was already tokenised
    by an earlier run of this cell) are returned unchanged, which makes the
    cell safe to re-run.
    """
    if not isinstance(value, str):
        return value
    # NOTE(review): a plain ',' split also cuts through names that contain a
    # comma themselves (e.g. "1,2-hexanediol") - confirm whether the data has any.
    return [word_tokenize(part.strip()) for part in value.split(',')]


# assign() builds a new frame instead of writing into a filtered slice.
df_cleaned = df_cleaned.assign(
    Ingredients=df_cleaned['Ingredients'].apply(_tokenise_ingredients)
)
df_cleaned

[nltk_data] Error loading punkt_tab: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:1129)>


Unnamed: 0,Ingredients,Combination,Dry,Normal,Oily,Sensitive
0,"[[algae, extract], [mineral, oil], [petrolatum...",1,1,1,1,1
1,"[[galactomyces, ferment, filtrate], [butylene,...",1,1,1,1,1
2,"[[water], [dicaprylyl, carbonate], [glycerin],...",1,1,1,1,0
3,"[[algae, extract], [cyclopentasiloxane], [petr...",1,1,1,1,1
4,"[[water], [snail, secretion, filtrate], [pheny...",1,1,1,1,1
...,...,...,...,...,...,...
1457,"[[sd, alcohol, 40], [butyloctyl, salicylate], ...",1,1,1,1,1
1458,"[[alcohol], [algae, extract], [aloe, barbadens...",1,1,1,1,1
1459,"[[water], [caprylic/caprlc, triglyceride], [gl...",1,1,1,1,0
1464,"[[water], [cyclopentasiloxane], [butyloctyl, s...",1,1,1,1,1


## Cleaned Dataset


In [8]:
# Write the cleaned frame to disk; index=False leaves the row labels out of
# the file.
cleaned_csv_path = "../data/cleaned/cleaned_dataset.csv"
df_cleaned.to_csv(cleaned_csv_path, index=False)