<a href="https://colab.research.google.com/github/lalitroy/ML/blob/main/Feature_Transform.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Explore functions used in feature transform

#### Import necessary libraries

In [77]:
import pandas as pd
import numpy as np

#### Read the CSV file directly from GitHub

In [78]:
# Raw CSV hosted on GitHub; pandas can read straight from an HTTP(S) URL.
DATA_URL = 'https://raw.githubusercontent.com/lalitroy/ML/main/Feature_Transform/supershops.csv'
df = pd.read_csv(DATA_URL)

#### Check for NaNs

```isnull()``` returns `True`/`False` for every single value in the dataframe. ```isnull().sum()``` returns the count of ```NaN```s in each column.

In [79]:
# Count missing values per column: the boolean mask is summed down the rows (axis=0).
df.isnull().sum(axis=0)

Marketing Spend    0
Administration     0
Transport          1
Area               0
Profit             0
dtype: int64

## Drop the row where data is missing

In [80]:
# Drop every row that contains at least one missing value.
# Rebinding `df` to the returned frame (instead of inplace=True) avoids mutating
# the loaded object in place; later cells keep using the same name `df`.
df = df.dropna()

# MinMaxScaler / Normalization

$x_{normalized} = \frac{x_i - x_{min}}{x_{max}-x_{min}}$

In [81]:
from sklearn.preprocessing import MinMaxScaler

In [82]:
# Work on a copy so the cleaned frame `df` stays available for the other scalers.
df_norm = df.copy()
scale_cols = ['Marketing Spend', 'Transport', 'Administration']

# Fit ONE scaler on all numeric columns at once. MinMaxScaler scales each feature
# (column) independently, so the values match scaling the columns one by one,
# but `model` now remembers the min/max of every column and can be reused for
# transform() / inverse_transform() instead of only knowing the last column fitted.
model = MinMaxScaler()
df_norm[scale_cols] = model.fit_transform(df_norm[scale_cols])
df_norm.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,0.692617,0.651744,1.0,Dhaka,192261.83
1,0.983359,0.761972,0.940893,Ctg,191792.06
2,0.927985,0.379579,0.864664,Rangpur,191050.39
3,0.873136,0.512998,0.812235,Dhaka,182901.99
4,0.859438,0.305328,0.776136,Rangpur,166187.94


# Standardization

https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html

$x_{new} = \frac{x_i - x_{mean}}{\sigma}$ where $\sigma$ is the (population) standard deviation

In [83]:
from sklearn.preprocessing import StandardScaler 

In [84]:
# Work on a copy so the cleaned frame `df` stays available for the other scalers.
df_std = df.copy()
scale_cols = ['Marketing Spend', 'Transport', 'Administration']

# Fit ONE scaler on all numeric columns at once. StandardScaler centres and scales
# each feature (column) independently, so the values match standardizing the
# columns one by one, but `model_std` now keeps the mean/scale of every column
# rather than only those of the last column it was refit on.
model_std = StandardScaler()
df_std[scale_cols] = model_std.fit_transform(df_std[scale_cols])
df_std.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,0.895248,0.586658,2.165287,Dhaka,192261.83
1,1.943398,1.110709,1.929843,Ctg,191792.06
2,1.743767,-0.707282,1.626191,Rangpur,191050.39
3,1.546035,-0.072973,1.417348,Dhaka,182901.99
4,1.49665,-1.060289,1.27355,Rangpur,166187.94


# Max Abs Scaler

$x_{new} = \frac{x_i}{\max_j |x_j|}$


https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MaxAbsScaler.html

In [85]:
from sklearn.preprocessing import MaxAbsScaler

In [86]:
# Work on a copy so the cleaned frame `df` stays available for the other scalers.
df_mas = df.copy()
scale_cols = ['Marketing Spend', 'Administration', 'Transport']

# Fit ONE scaler on all numeric columns at once. MaxAbsScaler divides each feature
# (column) by that column's own maximum absolute value, so the values match
# scaling the columns one by one, but `model_mas` now keeps the scale factor of
# every column rather than only that of the last column it was refit on.
model_mas = MaxAbsScaler()
df_mas[scale_cols] = model_mas.fit_transform(df_mas[scale_cols])

df_mas.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,0.692617,0.749527,1.0,Dhaka,192261.83
1,0.983359,0.828805,0.940893,Ctg,191792.06
2,0.927985,0.553781,0.864664,Rangpur,191050.39
3,0.873136,0.649738,0.812235,Dhaka,182901.99
4,0.859438,0.500378,0.776136,Rangpur,166187.94


# Robust Scaler

$x_{new} = \frac{x_i - x_{med}}{x_{75}-x_{25}}$

https://youtu.be/U9N-ELpCpc8


In [87]:
from sklearn.preprocessing import RobustScaler

In [88]:
# Work on a copy so the cleaned frame `df` stays available for the other scalers.
df_rs = df.copy()
scale_cols = ['Marketing Spend', 'Administration', 'Transport']

# Fit ONE scaler on all numeric columns at once. RobustScaler removes the median
# and divides by the interquartile range of each feature (column) independently,
# so the values match scaling the columns one by one, but `model_rs` now keeps
# the median/IQR of every column rather than only those of the last column fitted.
model_rs = RobustScaler()
df_rs[scale_cols] = model_rs.fit_transform(df_rs[scale_cols])
df_rs.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,0.669502,0.347651,1.552016,Dhaka,192261.83
1,1.428312,0.700141,1.383714,Ctg,191792.06
2,1.283789,-0.522689,1.166654,Rangpur,191050.39
3,1.140641,-0.096035,1.017368,Dhaka,182901.99
4,1.104889,-0.760132,0.914576,Rangpur,166187.94
