### Data Cleaning
[Link](https://www.youtube.com/watch?v=FrLhfeiyha0&list=PLD_eiqVVLZDj1onVQPuREzf90xRT2WIdR&index=27)

In [11]:
import pandas as pd

# Load the employee sample from the CSV next to the notebook and preview the first five rows.
data = pd.read_csv(filepath_or_buffer='sample-clean.csv')
data.head(n=5)

Unnamed: 0,emp_id,birth_date,first_name,last_name,gender,hire_date,salary,height,weight,birth_place
0,1,02-09-53,Georgi,Facello,M,26-06-86,500000,165,64.0,NY
1,2,02-06-64,Bezalel,Simmel,F,21-11-85,120000,155,59.0,Chicago
2,3,03-12-59,Parto,Bamford,M,28-08-86,350000,158,75.0,Los Angeles
3,4,01-05-54,Chirstian,Koblick,M,01-12-86,400000,149,77.0,Los Angeles
4,5,21-01-55,Kyoichi,Maliniak,M,12-09-89,200000,169,74.0,LA


### Simple Feature Scaling

$\displaystyle \text{Normalized Value} = \frac{\text{Old Value}}{\text{Maximum Value}}$

In [2]:
# Largest value in the height column.
data.loc[:, 'height'].max()

195

In [3]:
# Smallest value in the height column.
data.loc[:, 'height'].min()

149

In [4]:
# Simple feature scaling: divide every height by the column maximum,
# so the largest height becomes 1.0.
# NOTE: this overwrites the raw 'height' column of `data` in place.
height_max = data['height'].max()
data['height'] = data['height'].div(height_max)
data.head()

Unnamed: 0,emp_id,birth_date,first_name,last_name,gender,hire_date,salary,height,weight,birth_place
0,1,02-09-53,Georgi,Facello,M,26-06-86,500000,0.846154,64.0,NY
1,2,02-06-64,Bezalel,Simmel,F,21-11-85,120000,0.794872,59.0,Chicago
2,3,03-12-59,Parto,Bamford,M,28-08-86,350000,0.810256,75.0,Los Angeles
3,4,01-05-54,Chirstian,Koblick,M,01-12-86,400000,0.764103,77.0,Los Angeles
4,5,21-01-55,Kyoichi,Maliniak,M,12-09-89,200000,0.866667,74.0,LA


### Min-Max Normalization

$\displaystyle \text{Normalized Value} = \frac{\text{Old Value} - \text{Min Value}}{\text{Max Value} - \text{Min Value}}$

In [6]:
# Min-max normalization: shift by the column minimum, then divide by the
# (max - min) span, mapping the smallest weight to 0 and the largest to 1.
# NOTE: this overwrites the raw 'weight' column of `data` in place.
weight_min = data['weight'].min()
weight_span = data['weight'].max() - weight_min
data['weight'] = (data['weight'] - weight_min) / weight_span
data.head()

Unnamed: 0,emp_id,birth_date,first_name,last_name,gender,hire_date,salary,height,weight,birth_place
0,1,02-09-53,Georgi,Facello,M,26-06-86,500000,0.846154,0.46875,NY
1,2,02-06-64,Bezalel,Simmel,F,21-11-85,120000,0.794872,0.3125,Chicago
2,3,03-12-59,Parto,Bamford,M,28-08-86,350000,0.810256,0.8125,Los Angeles
3,4,01-05-54,Chirstian,Koblick,M,01-12-86,400000,0.764103,0.875,Los Angeles
4,5,21-01-55,Kyoichi,Maliniak,M,12-09-89,200000,0.866667,0.78125,LA


### Z-score Normalization

$\displaystyle \text{Normalized Value} = \frac{\text{Old Value} - \text{Mean } (\mu)}{\text{Standard Deviation } (\sigma)}$

In [7]:
# Summary statistics of the numeric columns (count, mean, std, min, quartiles, max).
# The quartiles are spelled out explicitly; they are the same as pandas' defaults.
data.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,emp_id,salary,height,weight
count,100.0,100.0,100.0,78.0
mean,50.5,325200.0,0.858718,0.491987
std,29.011492,193294.039894,0.04609,0.247473
min,1.0,120000.0,0.764103,0.0
25%,25.75,200000.0,0.820513,0.3125
50%,50.5,300000.0,0.861538,0.5
75%,75.25,400000.0,0.888462,0.679688
max,100.0,750000.0,1.0,1.0


In [8]:
# Z-score normalization: centre salary on its mean and scale by its standard deviation.
# pandas' Series.std() defaults to the sample standard deviation (ddof=1).
# NOTE: this overwrites the raw 'salary' column of `data` in place.
salary_mean = data['salary'].mean()
salary_std = data['salary'].std()
data['salary'] = data['salary'].sub(salary_mean).div(salary_std)
data.head()

Unnamed: 0,emp_id,birth_date,first_name,last_name,gender,hire_date,salary,height,weight,birth_place
0,1,02-09-53,Georgi,Facello,M,26-06-86,0.904322,0.846154,0.46875,NY
1,2,02-06-64,Bezalel,Simmel,F,21-11-85,-1.061595,0.794872,0.3125,Chicago
2,3,03-12-59,Parto,Bamford,M,28-08-86,0.128302,0.810256,0.8125,Los Angeles
3,4,01-05-54,Chirstian,Koblick,M,01-12-86,0.386975,0.764103,0.875,Los Angeles
4,5,21-01-55,Kyoichi,Maliniak,M,12-09-89,-0.647718,0.866667,0.78125,LA


row number 2 ရဲ့ salary က **အနုတ်**နဲ့ပြနေတာက မူလတန်ဖိုးကိုယ်တိုင်က mean ထက်နည်းနေတဲ့အတွက် negative value ဘက်ရောက်နေတယ်လို့ပြောလို့ရပါတယ်။

### One-hot Encoding

In [10]:
# One-hot encode 'gender': the column is replaced by one indicator column per category.
# The result is only displayed, not assigned, so `data` itself keeps its 'gender' column.
pd.get_dummies(data=data, columns=['gender'])

Unnamed: 0,emp_id,birth_date,first_name,last_name,hire_date,salary,height,weight,birth_place,gender_F,gender_M
0,1,02-09-53,Georgi,Facello,26-06-86,0.904322,0.846154,0.46875,NY,False,True
1,2,02-06-64,Bezalel,Simmel,21-11-85,-1.061595,0.794872,0.31250,Chicago,True,False
2,3,03-12-59,Parto,Bamford,28-08-86,0.128302,0.810256,0.81250,Los Angeles,False,True
3,4,01-05-54,Chirstian,Koblick,01-12-86,0.386975,0.764103,0.87500,Los Angeles,False,True
4,5,21-01-55,Kyoichi,Maliniak,12-09-89,-0.647718,0.866667,0.78125,LA,False,True
...,...,...,...,...,...,...,...,...,...,...,...
95,96,16-09-54,Jayson,Mandell,14-01-90,2.197688,0.887179,0.28125,LA,False,True
96,97,27-02-52,Remzi,Waschkowski,15-09-90,-0.647718,0.876923,,Chicago,False,True
97,98,23-09-61,Sreekrishna,Servieres,13-05-85,0.386975,0.810256,0.40625,New York,True,False
98,99,25-05-56,Valter,Sullins,18-10-88,-0.647718,0.815385,0.75000,Houston,True,False


### Data Binning

In [12]:
# Read the CSV again so that `data` holds the values exactly as stored in the file.
data = pd.read_csv(filepath_or_buffer='sample-clean.csv')
data.head(n=5)

Unnamed: 0,emp_id,birth_date,first_name,last_name,gender,hire_date,salary,height,weight,birth_place
0,1,02-09-53,Georgi,Facello,M,26-06-86,500000,165,64.0,NY
1,2,02-06-64,Bezalel,Simmel,F,21-11-85,120000,155,59.0,Chicago
2,3,03-12-59,Parto,Bamford,M,28-08-86,350000,158,75.0,Los Angeles
3,4,01-05-54,Chirstian,Koblick,M,01-12-86,400000,149,77.0,Los Angeles
4,5,21-01-55,Kyoichi,Maliniak,M,12-09-89,200000,169,74.0,LA


In [17]:
import numpy as np

# Six equally spaced edges from the lowest to the highest salary,
# i.e. the boundaries of five equal-width bins.
salary_low, salary_high = data['salary'].min(), data['salary'].max()
bins = np.linspace(start=salary_low, stop=salary_high, num=6)
bins

array([120000., 246000., 372000., 498000., 624000., 750000.])

In [18]:
# One label per interval between consecutive bin edges, ordered from lowest to highest.
bin_names = ['very low', 'low', 'average', 'high', 'very high']

# include_lowest=True makes the first interval include its left edge.
data['salary_group'] = pd.cut(
    x=data['salary'],
    bins=bins,
    labels=bin_names,
    include_lowest=True,
)
data.head()

Unnamed: 0,emp_id,birth_date,first_name,last_name,gender,hire_date,salary,height,weight,birth_place,salary_group
0,1,02-09-53,Georgi,Facello,M,26-06-86,500000,165,64.0,NY,high
1,2,02-06-64,Bezalel,Simmel,F,21-11-85,120000,155,59.0,Chicago,very low
2,3,03-12-59,Parto,Bamford,M,28-08-86,350000,158,75.0,Los Angeles,low
3,4,01-05-54,Chirstian,Koblick,M,01-12-86,400000,149,77.0,Los Angeles,average
4,5,21-01-55,Kyoichi,Maliniak,M,12-09-89,200000,169,74.0,LA,very low
