# Data Cleaning  

## Load and Present Data

In [236]:
import pandas as pd
%matplotlib


Using matplotlib backend: macosx


In [237]:
df = pd.read_csv('../data/raw/cosmetics.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1472 entries, 0 to 1471
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Label        1472 non-null   object 
 1   Brand        1472 non-null   object 
 2   Name         1472 non-null   object 
 3   Price        1472 non-null   int64  
 4   Rank         1472 non-null   float64
 5   Ingredients  1472 non-null   object 
 6   Combination  1472 non-null   int64  
 7   Dry          1472 non-null   int64  
 8   Normal       1472 non-null   int64  
 9   Oily         1472 non-null   int64  
 10  Sensitive    1472 non-null   int64  
dtypes: float64(1), int64(6), object(4)
memory usage: 126.6+ KB


## Handling Missing Values

In [238]:
print(df.isna().sum())

Label          0
Brand          0
Name           0
Price          0
Rank           0
Ingredients    0
Combination    0
Dry            0
Normal         0
Oily           0
Sensitive      0
dtype: int64


## Removing Duplicates 

In [239]:
print(df.duplicated().sum())

0


## Irrelevant/Erroneous Rows and Columns

In [240]:
import re
# removing irrelevant
irrelevent_columns = ["Label","Brand","Name","Price","Rank"]
df.drop(irrelevent_columns, axis=1, inplace=True)

# if the content in the ingredients is not a list then remove it
expected_format = r"[^A-Za-z0-9(),\-\s]+[:].*"

df_cleaned = df[~df['Ingredients'].str.contains(r"[^A-Za-z0-9(),\-\s]+[:].*", na=False) &
                ~df['Ingredients'].str.contains(r"#NAME\?|Visit|\*", na=False) &
                ~df['Ingredients'].str.match(expected_format) &
                ~df['Ingredients'].str.startswith('-')]

# if combination dry oily sensitive is 0 remove it bc this is logic error
df_cleaned = df_cleaned.loc[(df[['Combination', 'Dry', 'Normal', 'Oily', 'Sensitive']].sum(axis=1) > 0)]

df_cleaned


Unnamed: 0,Ingredients,Combination,Dry,Normal,Oily,Sensitive
0,"Algae (Seaweed) Extract, Mineral Oil, Petrolat...",1,1,1,1,1
1,"Galactomyces Ferment Filtrate (Pitera), Butyle...",1,1,1,1,1
2,"Water, Dicaprylyl Carbonate, Glycerin, Ceteary...",1,1,1,1,0
3,"Algae (Seaweed) Extract, Cyclopentasiloxane, P...",1,1,1,1,1
4,"Water, Snail Secretion Filtrate, Phenyl Trimet...",1,1,1,1,1
...,...,...,...,...,...,...
1457,"SD Alcohol 40, Butyloctyl Salicylate, Polyeste...",1,1,1,1,1
1458,"Alcohol (Organic), Algae Extract (Organic), Al...",1,1,1,1,1
1459,"Water, Caprylic/Caprlc Triglyceride, Glycerin,...",1,1,1,1,0
1464,"Water, Cyclopentasiloxane, Butyloctyl Salicyla...",1,1,1,1,1


## Standardisation of Ingredients 

In [241]:
# converting to lowercase
df_cleaned['Ingredients'] = df_cleaned['Ingredients'].str.lower()

# only using biological names for the ingredients - regex - pattern mathciung
for row in df_cleaned['Ingredients']:
    re.sub(r"\s*\(.*?\)", "", row)
    print(row)

algae (seaweed) extract, mineral oil, petrolatum, glycerin, isohexadecane, microcrystalline wax, lanolin alcohol, citrus aurantifolia (lime) extract, sesamum indicum (sesame) seed oil, eucalyptus globulus (eucalyptus) leaf oil, sesamum indicum (sesame) seed powder, medicago sativa (alfalfa) seed powder, helianthus annuus (sunflower) seedcake, prunus amygdalus dulcis (sweet almond) seed meal, sodium gluconate, copper gluconate, calcium gluconate, magnesium gluconate, zinc gluconate, magnesium sulfate, paraffin, tocopheryl succinate, niacin, water, beta-carotene, decyl oleate, aluminum distearate, octyldodecanol, citric acid, cyanocobalamin, magnesium stearate, panthenol, limonene, geraniol, linalool, hydroxycitronellal, citronellol, benzyl salicylate, citral, sodium benzoate, alcohol denat., fragrance.
galactomyces ferment filtrate (pitera), butylene glycol, pentylene glycol, water, sodium benzoate, methylparaben, sorbic acid.
water, dicaprylyl carbonate, glycerin, cetearyl alcohol, cet

## Feature Engineering 

In [243]:
g s# tokenisation
# natural langauage toolkit
import nltk
from nltk import word_tokenize
#tokenisation library
nltk.download('punkt_tab')

df_cleaned['Ingredients'] = df_cleaned['Ingredients'].apply(lambda x:word_tokenize(x.lower))

[nltk_data] Error loading punkt_tab: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:1129)>


LookupError: 
**********************************************************************
  Resource [93mpunkt_tab[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt_tab')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt_tab/english/[0m

  Searched in:
    - '/Users/hannah-ann/nltk_data'
    - '/Users/hannah-ann/venv/nltk_data'
    - '/Users/hannah-ann/venv/share/nltk_data'
    - '/Users/hannah-ann/venv/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************


## Feature Extraction 

In [235]:
# vectorisation
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer()
vectorizer = TfidfVectorizer(analyzer=lambda x: x)  # Use the tokenized list directly
weight = tfidf.fit_transform(df['Ingredients'])
df_cleaned['Ingredients'] = pd.DataFrame(weight.toarray(), columns=vectorizer.get_feature_names_out())
df_cleaned



NotFittedError: Vocabulary not fitted or provided

## Cleaned Dataset


In [None]:
df_cleaned.to_csv("../data/cleaned/cleaned_dataset.csv", index=False)