In [23]:
## Import necessary libraries
import pandas as pd
import numpy as np

# data encoding libraries
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, OrdinalEncoder, StandardScaler, MinMaxScaler

# Build a small sample dataset: one row per employee, given as
# (name, department, salary) tuples.
employees = pd.DataFrame(
    [
        ("Alice", "HR", 5000),
        ("Bob", "HR", 160000),
        ("Charlie", "IT", 75000),
        ("David", "IT", 8000),
    ],
    columns=["EmployeeName", "Department", "Salary"],
)

# Preview the frame (all four rows fit within head()'s default of five).
employees.head()

Unnamed: 0,EmployeeName,Department,Salary
0,Alice,HR,5000
1,Bob,HR,160000
2,Charlie,IT,75000
3,David,IT,8000


## Data Encoding

In [5]:
# Python booleans compare equal to the integers 1 and 0, which is why the
# True/False indicator columns produced by encoders can be used as numbers.
(True == 1, False == 0)

(True, True)

In [6]:
## Pandas get dummies
# Expand the string columns into boolean indicator columns; the numeric
# Salary column passes through unchanged. With drop_first=True the first
# category of each column (Alice, HR) gets no indicator of its own and is
# represented by all of that column's indicators being False.
pd.get_dummies(
    data=employees,
    drop_first=True,
)

Unnamed: 0,Salary,EmployeeName_Bob,EmployeeName_Charlie,EmployeeName_David,Department_IT
0,5000,False,False,False,False
1,160000,True,False,False,False
2,75000,False,True,False,True
3,8000,False,False,True,True


In [10]:
## One hot encoding

# Columns to encode, listed once so the selection passed to the encoder and
# the names requested from it cannot drift apart.
ohe_columns = ['Department', 'EmployeeName']

# drop='first' omits the first category of every feature (it becomes the
# all-zero baseline); sparse_output=False makes fit_transform return a dense
# NumPy array instead of a SciPy sparse matrix.
ohe = OneHotEncoder(drop='first', sparse_output=False)

# Keep the raw array under its own name instead of binding `encoded_df`
# first to an ndarray and then to a DataFrame.
encoded_array = ohe.fit_transform(employees[ohe_columns])

# Rebuild a labelled frame. Reusing the source index keeps the encoded rows
# aligned with `employees` even when its index is not the default 0..n-1.
encoded_df = pd.DataFrame(
    encoded_array,
    columns=ohe.get_feature_names_out(ohe_columns),
    index=employees.index,
)
encoded_df

Unnamed: 0,Department_IT,EmployeeName_Bob,EmployeeName_Charlie,EmployeeName_David
0,0.0,0.0,0.0,0.0
1,0.0,1.0,0.0,0.0
2,1.0,0.0,1.0,0.0
3,1.0,0.0,0.0,1.0


In [11]:
# Independent (deep) copy of the sample frame; changes made to one frame
# are not reflected in the other.
employees_le = employees.copy(deep=True)

employees_le

Unnamed: 0,EmployeeName,Department,Salary
0,Alice,HR,5000
1,Bob,HR,160000
2,Charlie,IT,75000
3,David,IT,8000


In [None]:
## Label Encoder

# LabelEncoder replaces each distinct value with an integer code
# (0 .. n_classes-1, assigned in sorted order). scikit-learn documents it as
# a transformer for target labels (y); OrdinalEncoder is the counterpart
# intended for feature columns.

# Encode a copy so that `employees` keeps its original string columns for
# the cells that follow (the scaling output still shows the names).
employees_label_encoded = employees.copy()

# Use one encoder per column and keep each of them: re-fitting a single
# shared encoder would overwrite its classes_, so only the last column
# could be decoded with inverse_transform afterwards.
label_encoders = {}

# Every non-numeric column is treated as categorical; Salary is skipped.
for col in employees_label_encoded.select_dtypes(exclude='number').columns:
    le = LabelEncoder()
    employees_label_encoded[col] = le.fit_transform(employees_label_encoded[col])
    label_encoders[col] = le

employees_label_encoded

In [19]:
# Deep copy of `employees_le` under a new name, so later edits to either
# frame do not affect the other.
employees_ca = employees_le.copy(deep=True)

employees_ca

Unnamed: 0,EmployeeName,Department,Salary
0,Alice,HR,5000
1,Bob,HR,160000
2,Charlie,IT,75000
3,David,IT,8000


In [21]:
## Ordinal Encoder

# OrdinalEncoder handles several feature columns in one call, giving each
# column its own integer coding and returning a 2-D float array.
oe = OrdinalEncoder()

# Fit and transform both categorical columns together.
employees_encoded = oe.fit_transform(
    employees.loc[:, ['Department', 'EmployeeName']]
)

# Wrap the array in a DataFrame, restoring the column labels in the same
# order they were passed to the encoder.
employees_encoded_df = pd.DataFrame(
    data=employees_encoded,
    columns=['Department', 'EmployeeName'],
)

employees_encoded_df

Unnamed: 0,Department,EmployeeName
0,0.0,0.0
1,0.0,1.0
2,1.0,2.0
3,1.0,3.0


## Data Scaling

- Standard scaler: centers each feature at a mean of 0 and scales it to unit variance.
- Min-max scaler: rescales each feature linearly so its values fall in the [0, 1] range.

In [24]:
# One scaler of each kind; both are fitted independently on the same column.
standard_scaler, minmax_scaler = StandardScaler(), MinMaxScaler()

# The scalers take 2-D input, so Salary is selected as a one-column frame
# (list of labels) rather than as a Series. Each result is stored next to
# the raw Salary column for side-by-side comparison.
employees['Salary_Standard'] = standard_scaler.fit_transform(
    employees.loc[:, ['Salary']]
)
employees['Salary_MinMax'] = minmax_scaler.fit_transform(
    employees.loc[:, ['Salary']]
)

employees

Unnamed: 0,EmployeeName,Department,Salary,Salary_Standard,Salary_MinMax
0,Alice,HR,5000,-0.903,0.0
1,Bob,HR,160000,1.552527,1.0
2,Charlie,IT,75000,0.205947,0.451613
3,David,IT,8000,-0.855474,0.019355


In [25]:
# Summary statistics for the numeric columns (quartiles spelled out; these
# are describe()'s defaults). Salary_Standard has mean ~0, but its std shows
# as ~1.155 rather than 1: StandardScaler divides by the population standard
# deviation (ddof=0), while describe() reports the sample one (ddof=1), and
# with four rows the ratio is sqrt(4/3).
employees.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,Salary,Salary_Standard,Salary_MinMax
count,4.0,4.0,4.0
mean,62000.0,-2.775558e-17,0.367742
std,72888.042001,1.154701,0.470245
min,5000.0,-0.9030004,0.0
25%,7250.0,-0.8673556,0.014516
50%,41500.0,-0.3247633,0.235484
75%,96250.0,0.5425923,0.58871
max,160000.0,1.552527,1.0
