This notebook contains: 
1. Handling null/missing values
2. Scaling
3. Encoding
4. Handling outliers: winsorization

In [2]:
import pandas as pd

In [9]:
df = pd.read_csv('files/HousePrices.csv')

In [16]:
print(df.shape)
df.info()
df.head()

(4600, 18)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4600 entries, 0 to 4599
Data columns (total 18 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   date           4600 non-null   object 
 1   price          4600 non-null   float64
 2   bedrooms       4596 non-null   float64
 3   bathrooms      4600 non-null   float64
 4   sqft_living    4600 non-null   int64  
 5   sqft_lot       4600 non-null   int64  
 6   floors         4600 non-null   float64
 7   waterfront     4600 non-null   int64  
 8   view           4600 non-null   int64  
 9   condition      4600 non-null   int64  
 10  sqft_above     4600 non-null   int64  
 11  sqft_basement  4600 non-null   int64  
 12  yr_built       4600 non-null   int64  
 13  yr_renovated   4600 non-null   int64  
 14  street         4600 non-null   object 
 15  city           4600 non-null   object 
 16  statezip       4600 non-null   object 
 17  country        4600 non-null   object 
dt

Unnamed: 0,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,sqft_above,sqft_basement,yr_built,yr_renovated,street,city,statezip,country
0,5/2/2014 0:00,313000.0,,1.5,1340,7912,1.5,0,0,3,1340,0,1955,2005,18810 Densmore Ave N,Shoreline,WA 98133,USA
1,5/2/2014 0:00,2384000.0,,2.5,3650,9050,2.0,0,4,5,3370,280,1921,0,709 W Blaine St,Seattle,WA 98119,USA
2,5/2/2014 0:00,342000.0,,2.0,1930,11947,1.0,0,0,4,1930,0,1966,0,26206-26214 143rd Ave SE,Kent,WA 98042,USA
3,5/2/2014 0:00,420000.0,,2.25,2000,8030,1.0,0,0,4,1000,1000,1963,0,857 170th Pl NE,Bellevue,WA 98008,USA
4,5/2/2014 0:00,550000.0,4.0,2.5,1940,10500,1.0,0,0,4,1140,800,1976,1992,9105 170th Ave NE,Redmond,WA 98052,USA


In [21]:
df.isnull().sum()    #get count of null-values in each column

date             0
price            0
bedrooms         4
bathrooms        0
sqft_living      0
sqft_lot         0
floors           0
waterfront       0
view             0
condition        0
sqft_above       0
sqft_basement    0
yr_built         0
yr_renovated     0
street           0
city             0
statezip         0
country          0
dtype: int64

In [33]:
##Filling missing values
df_filled = df.fillna(df.select_dtypes(include = 'number').mean())
df_filled.isnull().sum()

date             0
price            0
bedrooms         0
bathrooms        0
sqft_living      0
sqft_lot         0
floors           0
waterfront       0
view             0
condition        0
sqft_above       0
sqft_basement    0
yr_built         0
yr_renovated     0
street           0
city             0
statezip         0
country          0
dtype: int64

In [36]:
##Scaling using StandardScaler method
from sklearn.preprocessing import StandardScaler
num_col = df.select_dtypes(include = 'number').columns
scaler = StandardScaler()
df[num_col] = scaler.fit_transform(df[num_col])
df.head()

Unnamed: 0,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,sqft_above,sqft_basement,yr_built,yr_renovated,street,city,statezip,country
0,5/2/2014 0:00,-0.423864,,-0.843204,-0.829971,-0.193434,-0.022416,-0.085004,-0.309194,-0.667112,-0.565224,-0.672464,-0.531014,1.22167,18810 Densmore Ave N,Shoreline,WA 98133,USA
1,5/2/2014 0:00,3.249598,,0.432802,1.568528,-0.161718,0.906555,-0.085004,4.830079,2.286416,1.789559,-0.069128,-1.674693,-0.825693,709 W Blaine St,Seattle,WA 98119,USA
2,5/2/2014 0:00,-0.372424,,-0.205201,-0.217367,-0.080978,-0.951388,-0.085004,-0.309194,0.809652,0.119171,-0.672464,-0.161,-0.825693,26206-26214 143rd Ave SE,Kent,WA 98042,USA
3,5/2/2014 0:00,-0.234071,,0.1138,-0.144686,-0.190145,-0.951388,-0.085004,-0.309194,0.809652,-0.959621,1.482306,-0.261913,-0.825693,857 170th Pl NE,Bellevue,WA 98008,USA
4,5/2/2014 0:00,-0.003482,0.659363,0.432802,-0.206984,-0.121306,-0.951388,-0.085004,-0.309194,0.809652,-0.797222,1.051352,0.175376,1.208396,9105 170th Ave NE,Redmond,WA 98052,USA


In [39]:
##One-hot encoding using pandas' getdummies method
cat_col = df.select_dtypes(include = 'object').columns
df_encd = pd.get_dummies(df, columns = cat_col, drop_first = True, dtype = int)
df_encd.tail()

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,sqft_above,...,statezip_WA 98155,statezip_WA 98166,statezip_WA 98168,statezip_WA 98177,statezip_WA 98178,statezip_WA 98188,statezip_WA 98198,statezip_WA 98199,statezip_WA 98288,statezip_WA 98354
4595,-0.432437,-0.441012,-0.524202,-0.653458,-0.236689,-0.951388,-0.085004,-0.309194,0.809652,-0.368025,...,0,0,0,0,0,0,0,0,0,0
4596,-0.031271,-0.441012,0.432802,-0.705374,-0.202882,0.906555,-0.085004,-0.309194,-0.667112,-0.426025,...,0,0,0,0,0,0,0,0,0,0
4597,-0.239562,-0.441012,0.432802,0.904009,-0.218462,0.906555,-0.085004,-0.309194,-0.667112,1.371962,...,0,0,0,0,0,0,0,0,0,0
4598,-0.618268,0.659363,-0.205201,-0.051238,-0.229164,-0.951388,-0.085004,-0.309194,-0.667112,-0.878421,...,0,0,0,0,1,0,0,0,0,0
4599,-0.587759,-0.441012,0.432802,-0.674224,-0.188139,0.906555,-0.085004,-0.309194,0.809652,-0.391225,...,0,0,0,0,0,0,0,0,0,0
