# Techniques to perform Feature Transformation
1. Normalization
2. Standardization
3. Log Transformation
4. Robust Scaler
5. Max Absolute Scaler

In [1]:
# Sample cost values used by the scaling examples below.
li = [
    100,
    200,
    150,
    300,
    180,
]

In [2]:
li

[100, 200, 150, 300, 180]

In [3]:
import pandas as pd
# Single-column frame holding the raw cost values.
df = pd.DataFrame({'cost': li})
df

Unnamed: 0,cost
0,100
1,200
2,150
3,300
4,180


## Normalization / Min-Max Scaler

In [4]:
# Smallest and largest cost, needed for the min-max formula.
min_cost = df['cost'].min()
max_cost = df['cost'].max()

In [5]:
min_cost

np.int64(100)

In [6]:
max_cost

np.int64(300)

In [7]:
# Min-max scaling: (x - min) / (max - min).
df['cost_norm_manual'] = df['cost'].sub(min_cost).div(max_cost - min_cost)

In [8]:
df.head()

Unnamed: 0,cost,cost_norm_manual
0,100,0.0
1,200,0.5
2,150,0.25
3,300,1.0
4,180,0.4


In [9]:
from sklearn.preprocessing import MinMaxScaler

In [10]:
max = MinMaxScaler()

In [11]:
df['cost_norm_sklearn'] = max.fit_transform(df[['cost']])

In [12]:
df.head()

Unnamed: 0,cost,cost_norm_manual,cost_norm_sklearn
0,100,0.0,0.0
1,200,0.5,0.5
2,150,0.25,0.25
3,300,1.0,1.0
4,180,0.4,0.4


### Standardization / Standard Scaler

In [13]:
# Centre and spread of the cost column. ddof=1 is the pandas default and
# gives the sample standard deviation.
mean_cost = df['cost'].mean()
std = df['cost'].std(ddof=1)

In [14]:
# Z-score: (x - mean) / std, using the sample standard deviation.
df['cost_std_manual'] = df['cost'].sub(mean_cost).div(std)

In [15]:
df.head()

Unnamed: 0,cost,cost_norm_manual,cost_norm_sklearn,cost_std_manual
0,100,0.0,0.0,-1.161738
1,200,0.5,0.5,0.18912
2,150,0.25,0.25,-0.486309
3,300,1.0,1.0,1.539978
4,180,0.4,0.4,-0.081051


In [16]:
from sklearn.preprocessing import StandardScaler
# StandardScaler with its default settings (no arguments passed).
scaler = StandardScaler()

In [17]:
# Write the scaled values to a new column instead of overwriting `cost`,
# which the cells below still read in its original units.
df['cost_std_sklearn'] = scaler.fit_transform(df[['cost']])

In [18]:
df

Unnamed: 0,cost,cost_norm_manual,cost_norm_sklearn,cost_std_manual,cost_std_sklearn
0,100,0.0,0.0,-1.161738,-1.298863
1,200,0.5,0.5,0.18912,0.211443
2,150,0.25,0.25,-0.486309,-0.54371
3,300,1.0,1.0,1.539978,1.721748
4,180,0.4,0.4,-0.081051,-0.090618


In [19]:
# ddof = Delta Degrees of Freedom: the divisor is n - ddof, so ddof=0 divides
# by n (population standard deviation), which is what sklearn's StandardScaler uses.
std_dof = df.cost.std(ddof=0)

In [20]:
std_dof

np.float64(66.2117814289874)

In [21]:
# Sample standard deviation: the divisor is n - 1.
std_dof1 = df['cost'].std(ddof=1)
std_dof1

np.float64(74.02702209328699)

In [22]:
df

Unnamed: 0,cost,cost_norm_manual,cost_norm_sklearn,cost_std_manual,cost_std_sklearn
0,100,0.0,0.0,-1.161738,-1.298863
1,200,0.5,0.5,0.18912,0.211443
2,150,0.25,0.25,-0.486309,-0.54371
3,300,1.0,1.0,1.539978,1.721748
4,180,0.4,0.4,-0.081051,-0.090618


In [23]:
# Z-score with the sample standard deviation (ddof=1).
df['cost_std_manual2'] = df['cost'].sub(mean_cost).div(std_dof1)

In [24]:
df

Unnamed: 0,cost,cost_norm_manual,cost_norm_sklearn,cost_std_manual,cost_std_sklearn,cost_std_manual2
0,100,0.0,0.0,-1.161738,-1.298863,-1.161738
1,200,0.5,0.5,0.18912,0.211443,0.18912
2,150,0.25,0.25,-0.486309,-0.54371,-0.486309
3,300,1.0,1.0,1.539978,1.721748,1.539978
4,180,0.4,0.4,-0.081051,-0.090618,-0.081051


In [25]:
# Z-score with the population standard deviation (ddof=0).
df['cost_std_manual3'] = df['cost'].sub(mean_cost).div(std_dof)

In [26]:
df

Unnamed: 0,cost,cost_norm_manual,cost_norm_sklearn,cost_std_manual,cost_std_sklearn,cost_std_manual2,cost_std_manual3
0,100,0.0,0.0,-1.161738,-1.298863,-1.161738,-1.298863
1,200,0.5,0.5,0.18912,0.211443,0.18912,0.211443
2,150,0.25,0.25,-0.486309,-0.54371,-0.486309,-0.54371
3,300,1.0,1.0,1.539978,1.721748,1.539978,1.721748
4,180,0.4,0.4,-0.081051,-0.090618,-0.081051,-0.090618


### Log Transformation

In [27]:
from sklearn.preprocessing import FunctionTransformer
import numpy as np

In [28]:
# Wrap NumPy's log1p, i.e. log(1 + x), so it can be applied through the
# scikit-learn transformer interface, then store the result as a new column.
fc = FunctionTransformer(func=np.log1p)
df['cost_fc'] = fc.fit_transform(X=df[['cost']])
df

Unnamed: 0,cost,cost_norm_manual,cost_norm_sklearn,cost_std_manual,cost_std_sklearn,cost_std_manual2,cost_std_manual3,cost_fc
0,100,0.0,0.0,-1.161738,-1.298863,-1.161738,-1.298863,4.615121
1,200,0.5,0.5,0.18912,0.211443,0.18912,0.211443,5.303305
2,150,0.25,0.25,-0.486309,-0.54371,-0.486309,-0.54371,5.01728
3,300,1.0,1.0,1.539978,1.721748,1.539978,1.721748,5.70711
4,180,0.4,0.4,-0.081051,-0.090618,-0.081051,-0.090618,5.198497


In [29]:
# Same log(1 + x) transform applied directly with NumPy. Pass the 'cost'
# Series (single brackets) so a 1-D result is assigned to the new column,
# rather than a one-column DataFrame.
df['fc_np'] = np.log1p(df['cost'])

In [30]:
df

Unnamed: 0,cost,cost_norm_manual,cost_norm_sklearn,cost_std_manual,cost_std_sklearn,cost_std_manual2,cost_std_manual3,cost_fc,fc_np
0,100,0.0,0.0,-1.161738,-1.298863,-1.161738,-1.298863,4.615121,4.615121
1,200,0.5,0.5,0.18912,0.211443,0.18912,0.211443,5.303305,5.303305
2,150,0.25,0.25,-0.486309,-0.54371,-0.486309,-0.54371,5.01728,5.01728
3,300,1.0,1.0,1.539978,1.721748,1.539978,1.721748,5.70711,5.70711
4,180,0.4,0.4,-0.081051,-0.090618,-0.081051,-0.090618,5.198497,5.198497


# Encoding Concept in Feature Engineering

In [31]:
# Load the super-shop dataset; the relative path is resolved against the
# current working directory. This rebinds `df`, replacing the small cost
# frame built in the cells above.
df = pd.read_csv('supershops.csv')

In [32]:
# Eight independent copies of the freshly loaded frame, one per experiment,
# so that changes made to one copy do not affect the others.
df1, df2, df3, df4, df5, df6, df7, df8 = (df.copy() for _ in range(8))

In [33]:
df.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,114523.61,136897.8,471784.1,Dhaka,192261.83
1,162597.7,151377.59,443898.53,Ctg,191792.06
2,153441.51,101145.55,407934.54,Rangpur,191050.39
3,144372.41,118671.85,383199.62,Dhaka,182901.99
4,142107.34,91391.77,366168.42,Rangpur,166187.94


In [34]:
# Every column except Profit.
x = df.drop(columns='Profit')

In [35]:
x.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Area
0,114523.61,136897.8,471784.1,Dhaka
1,162597.7,151377.59,443898.53,Ctg
2,153441.51,101145.55,407934.54,Rangpur
3,144372.41,118671.85,383199.62,Dhaka
4,142107.34,91391.77,366168.42,Rangpur


In [36]:
# The Profit column on its own.
y = df.loc[:, 'Profit']

In [37]:
y.head()

0    192261.83
1    191792.06
2    191050.39
3    182901.99
4    166187.94
Name: Profit, dtype: float64

In [38]:
# Number of missing values in each column of x.
x.isna().sum()

Marketing Spend    0
Administration     0
Transport          1
Area               0
dtype: int64

In [39]:
df.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,114523.61,136897.8,471784.1,Dhaka,192261.83
1,162597.7,151377.59,443898.53,Ctg,191792.06
2,153441.51,101145.55,407934.54,Rangpur,191050.39
3,144372.41,118671.85,383199.62,Dhaka,182901.99
4,142107.34,91391.77,366168.42,Rangpur,166187.94


In [40]:
# Rows at positions 18, 19 and 20 (the end of the slice is exclusive).
df.iloc[18:21]

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
18,91749.16,114175.79,294919.57,Rangpur,124266.9
19,86419.7,153514.11,,Dhaka,122776.86
20,76253.86,113867.3,298664.47,Ctg,118474.03


In [46]:
import seaborn as sns

ModuleNotFoundError: No module named 'seaborn'

In [47]:
# Histogram (50 bins) of the Transport column with a KDE overlay.
# NOTE(review): needs seaborn installed. In the recorded run the import cell
# above failed with ModuleNotFoundError, so this cell raised NameError.
sns.histplot(df.Transport, kde=True, bins=50)

NameError: name 'sns' is not defined

# Measure of Central Tendency

In [48]:
df.Transport.mean()

np.float64(215331.7324489796)

In [49]:
df.Transport.median()

np.float64(214634.81)

In [50]:
# Average of the Transport column, kept for the imputation step.
mean = df['Transport'].mean()
mean

np.float64(215331.7324489796)

In [51]:
# Mean imputation: fill the missing Transport entry with the column average.
df['Transport'] = df['Transport'].fillna(mean)
df.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,114523.61,136897.8,471784.1,Dhaka,192261.83
1,162597.7,151377.59,443898.53,Ctg,191792.06
2,153441.51,101145.55,407934.54,Rangpur,191050.39
3,144372.41,118671.85,383199.62,Dhaka,182901.99
4,142107.34,91391.77,366168.42,Rangpur,166187.94


In [52]:
df.isnull().sum()

Marketing Spend    0
Administration     0
Transport          0
Area               0
Profit             0
dtype: int64

In [53]:
mean

np.float64(215331.7324489796)

In [54]:
df[18:21]

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
18,91749.16,114175.79,294919.57,Rangpur,124266.9
19,86419.7,153514.11,215331.732449,Dhaka,122776.86
20,76253.86,113867.3,298664.47,Ctg,118474.03


In [55]:
df2.isnull().sum()

Marketing Spend    0
Administration     0
Transport          1
Area               0
Profit             0
dtype: int64

In [56]:
# Same mean imputation on the df2 copy, reusing the `mean` computed from df.
df2['Transport'] = df2['Transport'].fillna(mean)
df2.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,114523.61,136897.8,471784.1,Dhaka,192261.83
1,162597.7,151377.59,443898.53,Ctg,191792.06
2,153441.51,101145.55,407934.54,Rangpur,191050.39
3,144372.41,118671.85,383199.62,Dhaka,182901.99
4,142107.34,91391.77,366168.42,Rangpur,166187.94


In [57]:
df2.isnull().sum()

Marketing Spend    0
Administration     0
Transport          0
Area               0
Profit             0
dtype: int64

# SKlearn preprocessing
**Some common encoding techniques in ML are:**
1. Label Encoding
2. One-Hot Encoding
3. Binary Encoding
4. Ordinal Encoding
5. Frequency Encoding
6. Mean Encoding
7. Text Encoding

In [58]:
df1.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,114523.61,136897.8,471784.1,Dhaka,192261.83
1,162597.7,151377.59,443898.53,Ctg,191792.06
2,153441.51,101145.55,407934.54,Rangpur,191050.39
3,144372.41,118671.85,383199.62,Dhaka,182901.99
4,142107.34,91391.77,366168.42,Rangpur,166187.94


In [59]:
from sklearn.preprocessing import LabelEncoder

In [60]:
# Encoder that maps each distinct label to an integer code.
le = LabelEncoder()

In [61]:
# LabelEncoder expects a 1-D array of labels, so pass the Series
# (single brackets); a one-column DataFrame triggers a DataConversionWarning.
# Encode df1's own column rather than reaching into df.
df1['Area'] = le.fit_transform(df1['Area'])

  y = column_or_1d(y, warn=True)


In [62]:
df1.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,114523.61,136897.8,471784.1,1,192261.83
1,162597.7,151377.59,443898.53,0,191792.06
2,153441.51,101145.55,407934.54,2,191050.39
3,144372.41,118671.85,383199.62,1,182901.99
4,142107.34,91391.77,366168.42,2,166187.94


In [63]:
le.classes_

array(['Ctg', 'Dhaka', 'Rangpur'], dtype=object)

In [64]:
le.transform(le.classes_)

array([0, 1, 2])

### Decode the encoded values back to original classes

In [65]:
# Map the integer codes in df1 back to their original labels.
decoded_classes = le.inverse_transform(df1['Area'])
decoded_classes

array(['Dhaka', 'Ctg', 'Rangpur', 'Dhaka', 'Rangpur', 'Dhaka', 'Ctg',
       'Rangpur', 'Dhaka', 'Ctg', 'Rangpur', 'Ctg', 'Rangpur', 'Ctg',
       'Rangpur', 'Dhaka', 'Ctg', 'Dhaka', 'Rangpur', 'Dhaka', 'Ctg',
       'Dhaka', 'Rangpur', 'Rangpur', 'Dhaka', 'Ctg', 'Rangpur', 'Dhaka',
       'Rangpur', 'Dhaka', 'Rangpur', 'Dhaka', 'Ctg', 'Rangpur', 'Ctg',
       'Dhaka', 'Rangpur', 'Ctg', 'Dhaka', 'Ctg', 'Ctg', 'Rangpur', 'Ctg',
       'Dhaka', 'Ctg', 'Dhaka', 'Rangpur', 'Ctg', 'Dhaka', 'Ctg'],
      dtype=object)

In [66]:
df1.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,114523.61,136897.8,471784.1,1,192261.83
1,162597.7,151377.59,443898.53,0,191792.06
2,153441.51,101145.55,407934.54,2,191050.39
3,144372.41,118671.85,383199.62,1,182901.99
4,142107.34,91391.77,366168.42,2,166187.94


In [67]:
# Put the decoded labels back into the Area column.
df1['Area'] = decoded_classes
df1.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,114523.61,136897.8,471784.1,Dhaka,192261.83
1,162597.7,151377.59,443898.53,Ctg,191792.06
2,153441.51,101145.55,407934.54,Rangpur,191050.39
3,144372.41,118671.85,383199.62,Dhaka,182901.99
4,142107.34,91391.77,366168.42,Rangpur,166187.94


### One Hot Encoder

In [68]:
df4.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,114523.61,136897.8,471784.1,Dhaka,192261.83
1,162597.7,151377.59,443898.53,Ctg,191792.06
2,153441.51,101145.55,407934.54,Rangpur,191050.39
3,144372.41,118671.85,383199.62,Dhaka,182901.99
4,142107.34,91391.77,366168.42,Rangpur,166187.94


In [69]:
pd.get_dummies(df4['Area']).head()

Unnamed: 0,Ctg,Dhaka,Rangpur
0,False,True,False
1,True,False,False
2,False,False,True
3,False,True,False
4,False,False,True


In [71]:
pd.get_dummies(df4['Area'], drop_first=True).head()

Unnamed: 0,Dhaka,Rangpur
0,True,False
1,False,False
2,False,True
3,True,False
4,False,True


In [72]:
# Indicator columns for Area, named "Area <value>"; the first category is
# dropped.
dummy = pd.get_dummies(
    df4['Area'],
    prefix='Area',
    prefix_sep=' ',
    drop_first=True,
)

In [73]:
dummy.head()

Unnamed: 0,Area Dhaka,Area Rangpur
0,True,False
1,False,False
2,False,True
3,True,False
4,False,True


In [74]:
df4.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,114523.61,136897.8,471784.1,Dhaka,192261.83
1,162597.7,151377.59,443898.53,Ctg,191792.06
2,153441.51,101145.55,407934.54,Rangpur,191050.39
3,144372.41,118671.85,383199.62,Dhaka,182901.99
4,142107.34,91391.77,366168.42,Rangpur,166187.94


In [75]:
# Remove the raw Area column. Reassigning (instead of inplace=True) and
# ignoring an already-missing column lets this cell be re-run without a KeyError.
df4 = df4.drop(columns='Area', errors='ignore')

In [76]:
df4.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Profit
0,114523.61,136897.8,471784.1,192261.83
1,162597.7,151377.59,443898.53,191792.06
2,153441.51,101145.55,407934.54,191050.39
3,144372.41,118671.85,383199.62,182901.99
4,142107.34,91391.77,366168.42,166187.94


In [77]:
dummy.head()

Unnamed: 0,Area Dhaka,Area Rangpur
0,True,False
1,False,False
2,False,True
3,True,False
4,False,True


In [78]:
# Place the indicator columns next to the remaining columns of df4.
new_df4 = pd.concat([df4, dummy], axis='columns')

In [79]:
new_df4.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Profit,Area Dhaka,Area Rangpur
0,114523.61,136897.8,471784.1,192261.83,True,False
1,162597.7,151377.59,443898.53,191792.06,False,False
2,153441.51,101145.55,407934.54,191050.39,False,True
3,144372.41,118671.85,383199.62,182901.99,True,False
4,142107.34,91391.77,366168.42,166187.94,False,True


#### One-Hot encoding with LabelBinarizer

In [80]:
df5.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,114523.61,136897.8,471784.1,Dhaka,192261.83
1,162597.7,151377.59,443898.53,Ctg,191792.06
2,153441.51,101145.55,407934.54,Rangpur,191050.39
3,144372.41,118671.85,383199.62,Dhaka,182901.99
4,142107.34,91391.77,366168.42,Rangpur,166187.94


In [81]:
from sklearn.preprocessing import LabelBinarizer

In [83]:
# One indicator column per class found in Area.
lb = LabelBinarizer()
area_encoded = lb.fit_transform(df5['Area'])

# Reuse df5's index so the indicator rows stay aligned with df5 when the two
# frames are concatenated column-wise later on.
area_encoded_df = pd.DataFrame(area_encoded, index=df5.index, columns=lb.classes_)
area_encoded_df.head()

Unnamed: 0,Ctg,Dhaka,Rangpur
0,0,1,0
1,1,0,0
2,0,0,1
3,0,1,0
4,0,0,1


In [84]:
lb.classes_

array(['Ctg', 'Dhaka', 'Rangpur'], dtype='<U7')

In [85]:
# Drop the first class by slicing it off the encoded array.
# Building the frame from `area_encoded` (rather than re-assigning the frame
# from itself) keeps this cell re-runnable: dropping an already-removed column
# would raise KeyError.
area_encoded_df = pd.DataFrame(
    area_encoded[:, 1:],
    index=df5.index,
    columns=lb.classes_[1:],
)
area_encoded_df.head()

Unnamed: 0,Dhaka,Rangpur
0,1,0
1,0,0
2,0,1
3,1,0
4,0,1


In [86]:
# Remove the raw Area column. Reassigning (instead of inplace=True) and
# ignoring an already-missing column lets this cell be re-run without a KeyError.
df5 = df5.drop(columns='Area', errors='ignore')
df5.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Profit
0,114523.61,136897.8,471784.1,192261.83
1,162597.7,151377.59,443898.53,191792.06
2,153441.51,101145.55,407934.54,191050.39
3,144372.41,118671.85,383199.62,182901.99
4,142107.34,91391.77,366168.42,166187.94


In [88]:
# Place the indicator columns next to the remaining columns of df5.
new_df5 = pd.concat([df5, area_encoded_df], axis='columns')
new_df5.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Profit,Dhaka,Rangpur
0,114523.61,136897.8,471784.1,192261.83,1,0
1,162597.7,151377.59,443898.53,191792.06,0,0
2,153441.51,101145.55,407934.54,191050.39,0,1
3,144372.41,118671.85,383199.62,182901.99,1,0
4,142107.34,91391.77,366168.42,166187.94,0,1
