In [5]:
import pandas as pd

# Keep the untouched frame under its own name so the cleaned frame can be
# rebuilt without re-reading the CSV from disk.
data1_raw = pd.read_csv('files/Melbourne_housing_FULL.csv')

# dropna() with default arguments drops every ROW that has a missing value in
# any column. Row labels are kept (no reset_index), so the index of data1 has gaps.
# .copy() makes data1 an independent frame: later cells add columns to it, and
# some pandas versions may flag a dropna() result as a possible view
# (SettingWithCopyWarning) -- the explicit copy avoids that.
# NOTE(review): dropna() does not treat infinite values as missing; the preview
# below shows an `inf` in BuildingArea -- confirm whether that needs cleaning.
data1 = data1_raw.dropna().copy()
data1.head()

Unnamed: 0,Suburb,Address,Rooms,Type,Method,SellerG,Date,Distance,Postcode,Bedroom,...,Landsize,BuildingArea,YearBuilt,CouncilArea,Latitude,Longtitude,Regionname,Propertycount,ParkingArea,Price
1,Airport West,154 Halsey Rd,3,t,PI,Nelson,3/9/2016,13.5,3042.0,3.0,...,303.0,225.0,2016.0,Moonee Valley City Council,-37.718,144.878,Western Metropolitan,3464.0,Detached Garage,840000.0
2,Albert Park,105 Kerferd Rd,2,h,S,hockingstuart,3/9/2016,3.3,3206.0,2.0,...,120.0,82.0,1900.0,Port Phillip City Council,-37.8459,144.9555,Southern Metropolitan,3280.0,Attached Garage,1275000.0
5,Alphington,6 Smith St,4,h,S,Brace,3/9/2016,6.4,3078.0,3.0,...,853.0,263.0,1930.0,Darebin City Council,-37.7707,145.0318,Northern Metropolitan,2211.0,Underground,2000000.0
6,Alphington,5/6 Yarralea St,3,h,S,Jellis,3/9/2016,6.4,3078.0,3.0,...,208.0,inf,2013.0,Darebin City Council,-37.7854,145.0325,Northern Metropolitan,2211.0,Outdoor Stall,1110000.0
7,Altona,158 Queen St,3,h,VB,Greg,3/9/2016,13.8,3018.0,3.0,...,352.0,242.0,2015.0,Hobsons Bay City Council,-37.87,144.825,Western Metropolitan,5301.0,Parking Pad,520000.0


In [1]:
# scaling & z-score: z = (x - mean) / standard deviation
# if z-score = 1, value x is 1 standard deviation above the average (z-score = -1 means 1 standard deviation below)

In [2]:
# StandardScaler turns each column it is given into z-scores of that column
# (value minus the column mean, divided by the column standard deviation)
# this puts variables measured in different units on a comparable, apples-to-apples scale
from sklearn import preprocessing
std_scale = preprocessing.StandardScaler()
std_scale

StandardScaler()

In [7]:
# Fit the scaler on Bedroom and store the standardized (z-score) values in a
# new column, leaving the original Bedroom column untouched.
bedroom_column = data1[['Bedroom']]  # double brackets: a 2-D, one-column selection
data1['Bedroom_Stdscale'] = std_scale.fit_transform(bedroom_column)

In [9]:
# first five z-scores; in the data preview Bedroom = 3.0 shows up here as about -0.08 and Bedroom = 2.0 as about -1.12
data1['Bedroom_Stdscale'].head()

1   -0.080681
2   -1.115676
5   -0.080681
6   -0.080681
7   -0.080681
Name: Bedroom_Stdscale, dtype: float64

In [10]:
# Smallest and largest z-score of Bedroom, expressed in standard deviations.
# A minimum near -3 does NOT mean a house with -3 bedrooms: it means some house
# has a bedroom count about 3 standard deviations below the average.
# Likewise the maximum says some house sits about 9 standard deviations above the average.
bedroom_z = data1['Bedroom_Stdscale']
bedroom_z.min(), bedroom_z.max()

(-3.1856679331239324, 9.234280724028467)

In [11]:
data1['Bedroom_Stdscale'].describe() # count, mean, std plus the five-number summary (min, quartiles, max)
# the mean of the STANDARDIZED column is 0 (3.8e-17 is floating-point round-off); this is not the mean number of bedrooms
# std prints as 1.000056 rather than exactly 1: presumably describe() uses the sample std (n - 1)
# while StandardScaler divides by the population std (n) -- sqrt(8890 / 8889) is about 1.000056

count    8.890000e+03
mean     3.836451e-17
std      1.000056e+00
min     -3.185668e+00
25%     -1.115676e+00
50%     -8.068077e-02
75%      9.543150e-01
max      9.234281e+00
Name: Bedroom_Stdscale, dtype: float64

## MinMaxScaler
Rescale a variable so that its minimum becomes 0 and its maximum becomes 1:

$x_{scaled} = (x - \min) / (\max - \min)$

In [13]:
# scaler that rescales a column to the 0-1 range: (x - min) / (max - min)
minmax_scale = preprocessing.MinMaxScaler()
minmax_scale

MinMaxScaler()

In [14]:
# Rescale Car to the 0-1 range and keep the result as a separate column.
car_column = data1[['Car']]  # double brackets: a 2-D, one-column selection
data1['Car_MinMaxScale'] = minmax_scale.fit_transform(car_column)

In [15]:
# preview the first five rescaled Car values
data1['Car_MinMaxScale'].head(5)

1    0.1
2    0.0
5    0.4
6    0.2
7    0.1
Name: Car_MinMaxScale, dtype: float64

In [16]:
# Sanity check: after min-max scaling the extremes should be exactly 0 and 1.
car_scaled = data1['Car_MinMaxScale']
car_scaled.min(), car_scaled.max()

(0.0, 1.0)

In [17]:
# min is 0 and max is 1 after min-max scaling; the quartiles (0.1, 0.2, 0.2) show most values sit near the bottom of the range
data1['Car_MinMaxScale'].describe()

count    8890.000000
mean        0.169224
std         0.097534
min         0.000000
25%         0.100000
50%         0.200000
75%         0.200000
max         1.000000
Name: Car_MinMaxScale, dtype: float64

### Log Transformation

In [18]:
import numpy as np

In [19]:
# Wrap numpy's log1p, i.e. log(1 + x), in a scikit-learn transformer so it can be
# used through the same fit/transform interface as the scalers above.
# Only the wrapped function is applied; the transformer adds no other numpy capabilities.
log_transformer = preprocessing.FunctionTransformer(func=np.log1p)
log_transformer

FunctionTransformer(func=<ufunc 'log1p'>)

In [20]:
# count missing Distance values; expected to be 0 because rows with any null were dropped when data1 was loaded
data1['Distance'].isnull().sum()

0

In [21]:
# Mean imputation: replace any missing Distance value with the column average.
# On this data the step changes nothing (the null count checked just before is 0);
# it is kept to show how imputation would be done.
distance_mean = data1['Distance'].mean()
data1['Distance'] = data1['Distance'].fillna(distance_mean)

In [22]:
# Store log(1 + Distance) in a new column; the original Distance column is kept.
distance_column = data1[['Distance']]  # double brackets: a 2-D, one-column selection
data1['Distance_logtransform'] = log_transformer.fit_transform(distance_column)

In [23]:
# first five log(1 + Distance) values; e.g. Distance 13.5 in the data preview becomes log(14.5), about 2.674
data1['Distance_logtransform'].head()

1    2.674149
2    1.458615
5    2.001480
6    2.001480
7    2.694627
Name: Distance_logtransform, dtype: float64

In [24]:
# summary of the log-transformed distances; a minimum of 0 corresponds to Distance = 0, since log1p(0) = 0
data1['Distance_logtransform'].describe()

count    8890.000000
mean        2.356024
std         0.550826
min         0.000000
25%         2.001480
50%         2.415914
75%         2.701361
max         3.879500
Name: Distance_logtransform, dtype: float64

### Exponential Transformation
The opposite direction of the log transform: exponentiating stretches the scale and spreads values apart, which is useful when observations are clumped too closely together.

In [26]:
# Wrap numpy's exp (e ** x) in a scikit-learn transformer, mirroring the log transformer.
exp_transformer = preprocessing.FunctionTransformer(func=np.exp)
exp_transformer

FunctionTransformer(func=<ufunc 'exp'>)

In [27]:
# Store e ** Rooms in a new column; the original Rooms column is kept.
rooms_column = data1[['Rooms']]  # double brackets: a 2-D, one-column selection
data1['Rooms_exptransform'] = exp_transformer.fit_transform(rooms_column)

In [28]:
# first five e ** Rooms values: 2 rooms -> about 7.39, 3 rooms -> about 20.09, 4 rooms -> about 54.60
data1['Rooms_exptransform'].head(5)

1    20.085537
2     7.389056
5    54.598150
6    20.085537
7    20.085537
Name: Rooms_exptransform, dtype: float64

In [29]:
# the std (about 1743) dwarfs the mean (about 58): exponentiating stretches the upper tail dramatically
# min 2.718 is e ** 1; the max (about 162755) is e ** 12
data1['Rooms_exptransform'].describe()

count      8890.000000
mean         57.862291
std        1743.357006
min           2.718282
25%           7.389056
50%          20.085537
75%          54.598150
max      162754.791419
Name: Rooms_exptransform, dtype: float64

In [None]:
# transformations are a way to study variability and to handle different types of outcomes in a dataset