# Feature Engineering


Faeture engineering deals with preparing data for it to be processed by a ML algorithm and train a model. It is an extreme important part of the Machine learning process as it ensures our model is precise, perfectly fitted, and efficient


In [158]:
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'
import numpy as np

In [159]:
ds = pd.read_csv("Data/Housing.csv")
ds.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished


In [160]:
ds.isnull().sum()

price               0
area                0
bedrooms            0
bathrooms           0
stories             0
mainroad            0
guestroom           0
basement            0
hotwaterheating     0
airconditioning     0
parking             0
prefarea            0
furnishingstatus    0
dtype: int64

## Imputation

Not really necessary as there are no missing values


#### a. Normal


In [161]:
threshold=0.7

In [162]:
ds = ds[ds.columns[ds.isnull().mean() < threshold]]
ds

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished
...,...,...,...,...,...,...,...,...,...,...,...,...,...
540,1820000,3000,2,1,1,yes,no,yes,no,no,2,no,unfurnished
541,1767150,2400,3,1,1,no,no,no,no,no,0,no,semi-furnished
542,1750000,3620,2,1,1,yes,no,no,no,no,0,no,unfurnished
543,1750000,2910,3,1,1,no,no,no,no,no,0,no,furnished


In [163]:
ds = ds.loc[ds.isnull().mean(axis=1) < threshold]
ds

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished
...,...,...,...,...,...,...,...,...,...,...,...,...,...
540,1820000,3000,2,1,1,yes,no,yes,no,no,2,no,unfurnished
541,1767150,2400,3,1,1,no,no,no,no,no,0,no,semi-furnished
542,1750000,3620,2,1,1,yes,no,no,no,no,0,no,unfurnished
543,1750000,2910,3,1,1,no,no,no,no,no,0,no,furnished


##### b. Numerical imputation


In [164]:
ds = ds.fillna(ds.mean())
ds

  ds = ds.fillna(ds.mean())


Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished
...,...,...,...,...,...,...,...,...,...,...,...,...,...
540,1820000,3000,2,1,1,yes,no,yes,no,no,2,no,unfurnished
541,1767150,2400,3,1,1,no,no,no,no,no,0,no,semi-furnished
542,1750000,3620,2,1,1,yes,no,no,no,no,0,no,unfurnished
543,1750000,2910,3,1,1,no,no,no,no,no,0,no,furnished


##### c. Categorical Imputation


In [165]:
ds['bathrooms'].fillna(ds['bathrooms'].value_counts().idxmax(), inplace=True)
ds.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished


## Outlier detection


a. With standard deviation

In [166]:
factor = 2

In [167]:
parking_upper_limit= ds['parking'].mean()+factor*ds['parking'].std();
parking_lower_limit= ds['parking'].mean()-factor*ds['parking'].std();

In [168]:
print(f"upper - {parking_upper_limit}")
print(f"lower - {parking_lower_limit}")

upper - 2.416749482572466
lower - -1.0295935192697137


In [169]:
ds = ds[(ds['parking']>=parking_lower_limit) & (ds['parking'] <=parking_upper_limit)]
ds.count()

price               533
area                533
bedrooms            533
bathrooms           533
stories             533
mainroad            533
guestroom           533
basement            533
hotwaterheating     533
airconditioning     533
parking             533
prefarea            533
furnishingstatus    533
dtype: int64

b. With percentile

In [170]:
cutoff = 0.03

In [171]:
ds['bedrooms'].describe()

count    533.000000
mean       2.958724
std        0.739706
min        1.000000
25%        2.000000
50%        3.000000
75%        3.000000
max        6.000000
Name: bedrooms, dtype: float64

In [172]:
ds['bedrooms'].value_counts()

3    293
2    135
4     91
5     10
6      2
1      2
Name: bedrooms, dtype: int64

In [173]:
upper_lim_bedrooms = ds['bedrooms'].quantile(1-cutoff)
lower_lim_bedrooms = ds['bedrooms'].quantile(cutoff)

In [174]:
temp_ds = ds[(ds['bedrooms'] < upper_lim_bedrooms) & (ds['bedrooms'] > lower_lim_bedrooms)]

In [175]:
temp_ds

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
5,10850000,7500,3,3,1,yes,no,yes,no,yes,2,yes,semi-furnished
9,9800000,5750,3,2,4,yes,yes,no,no,yes,1,yes,unfurnished
10,9800000,13200,3,1,2,yes,no,yes,no,yes,2,yes,furnished
14,9240000,7800,3,2,2,yes,no,no,no,no,0,yes,semi-furnished
...,...,...,...,...,...,...,...,...,...,...,...,...,...
533,2100000,2400,3,1,2,yes,no,no,no,no,0,no,unfurnished
537,1890000,1700,3,1,2,yes,no,no,no,no,0,no,unfurnished
541,1767150,2400,3,1,1,no,no,no,no,no,0,no,semi-furnished
543,1750000,2910,3,1,1,no,no,no,no,no,0,no,furnished


In [176]:
temp_ds['bedrooms'].describe()

count    293.0
mean       3.0
std        0.0
min        3.0
25%        3.0
50%        3.0
75%        3.0
max        3.0
Name: bedrooms, dtype: float64

In [177]:
temp_ds['bedrooms'].value_counts()

3    293
Name: bedrooms, dtype: int64

In [178]:
ds.guestroom.replace(('yes','no'),(1,0),inplace=True)  # One hot encoding, more ahead

In [179]:
ds['bedrooms'] = ds['bedrooms'] + ds['guestroom']
del ds['guestroom']
ds.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,yes,2,yes,furnished
2,12250000,9960,3,2,2,yes,yes,no,no,2,yes,semi-furnished
4,11410000,7420,5,1,2,yes,yes,no,yes,2,no,furnished
5,10850000,7500,3,3,1,yes,yes,no,yes,2,yes,semi-furnished
6,10150000,8580,4,3,4,yes,no,no,yes,2,yes,semi-furnished


## Binning

In [180]:
ds['parking'].value_counts()

0    299
1    126
2    108
Name: parking, dtype: int64

In [181]:
conditions = [ds['parking'] == 0,ds['parking'] > 0]
choices = [0,1]
ds['parking'] = np.select(conditions,choices,default=None)
ds

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,yes,1,yes,furnished
2,12250000,9960,3,2,2,yes,yes,no,no,1,yes,semi-furnished
4,11410000,7420,5,1,2,yes,yes,no,yes,1,no,furnished
5,10850000,7500,3,3,1,yes,yes,no,yes,1,yes,semi-furnished
6,10150000,8580,4,3,4,yes,no,no,yes,1,yes,semi-furnished
...,...,...,...,...,...,...,...,...,...,...,...,...
540,1820000,3000,2,1,1,yes,yes,no,no,1,no,unfurnished
541,1767150,2400,3,1,1,no,no,no,no,0,no,semi-furnished
542,1750000,3620,2,1,1,yes,no,no,no,0,no,unfurnished
543,1750000,2910,3,1,1,no,no,no,no,0,no,furnished


In [182]:
ds['parking'].value_counts()

0    299
1    234
Name: parking, dtype: int64

## One hot encoding

In [183]:
ds.mainroad.replace(('yes','no'),(1,0),inplace=True)


In [184]:
ds.basement.replace(('yes','no'),(1,0),inplace=True)


In [185]:
ds.hotwaterheating.replace(('yes','no'),(1,0),inplace=True)


In [186]:
ds.airconditioning.replace(('yes','no'),(1,0),inplace=True)


In [187]:
ds.prefarea.replace(('yes','no'),(1,0),inplace=True)


In [188]:
ds.furnishingstatus.replace(('furnished','semi-furnished','unfurnished'),(2,1,0),inplace=True)


In [189]:
ds.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,1,0,0,1,1,1,2
2,12250000,9960,3,2,2,1,1,0,0,1,1,1
4,11410000,7420,5,1,2,1,1,0,1,1,0,2
5,10850000,7500,3,3,1,1,1,0,1,1,1,1
6,10150000,8580,4,3,4,1,0,0,1,1,1,1


In [190]:
ds['area'].describe()

count      533.000000
mean      5104.887430
std       2160.626731
min       1650.000000
25%       3540.000000
50%       4500.000000
75%       6325.000000
max      16200.000000
Name: area, dtype: float64

## Log transformation

In [191]:
ds['area_log'] = np.log(ds['area'])
ds.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus,area_log
0,13300000,7420,4,2,3,1,0,0,1,1,1,2,8.911934
2,12250000,9960,3,2,2,1,1,0,0,1,1,1,9.206332
4,11410000,7420,5,1,2,1,1,0,1,1,0,2,8.911934
5,10850000,7500,3,3,1,1,1,0,1,1,1,1,8.922658
6,10150000,8580,4,3,4,1,0,0,1,1,1,1,9.057189
