In [1]:
import numpy as np 
import pandas as pd 
from matplotlib import pyplot as plt
import seaborn as sns

# 1. Gathering Data


- Reading the housing information data using read_csv to represent in tabular format 

In [2]:
data = pd.read_csv('melbourne.csv')
data.shape

(13518, 10)

# 2. Exploratory Data Analysis



- Let's check the first five rows from data

In [3]:
data.head()

Unnamed: 0,suburb,rooms,type,price,bedroom,bathroom,garage,size,authority,region
0,Abbotsford,2,h,1480000,2,1,1,202,Yarra,Northern Metropolitan
1,Abbotsford,2,h,1035000,2,1,0,156,Yarra,Northern Metropolitan
2,Abbotsford,3,h,1465000,3,2,0,134,Yarra,Northern Metropolitan
3,Abbotsford,3,h,850000,3,2,1,94,Yarra,Northern Metropolitan
4,Abbotsford,4,h,1600000,3,1,2,120,Yarra,Northern Metropolitan


- Let's check the last five rows from data

In [4]:
data.tail()

Unnamed: 0,suburb,rooms,type,price,bedroom,bathroom,garage,size,authority,region
13513,Wheelers Hill,4,h,1245000,4,2,2,652,,South-Eastern Metropolitan
13514,Williamstown,3,h,1031000,3,2,2,333,,Western Metropolitan
13515,Williamstown,3,h,1170000,3,2,4,436,,Western Metropolitan
13516,Williamstown,4,h,2500000,4,1,5,866,,Western Metropolitan
13517,Yarraville,4,h,1285000,4,1,1,362,,Western Metropolitan


- Let's check the random ten number of data samples, Every time it will print the random five sample of records from original datasets. So we can easly understand the behaviour and what types of data type stored in particular features.

In [5]:
data.sample(10)

Unnamed: 0,suburb,rooms,type,price,bedroom,bathroom,garage,size,authority,region
4832,Prahran,2,h,1055000,2,1,0,203,Stonnington,Southern Metropolitan
5687,South Yarra,3,h,1400000,3,2,2,0,Stonnington,Southern Metropolitan
3394,Ivanhoe,3,h,1850000,3,2,2,684,Banyule,Eastern Metropolitan
129,Alphington,3,h,1540000,3,1,2,592,Darebin,Northern Metropolitan
3870,Malvern East,4,h,1580000,4,2,1,504,Stonnington,Southern Metropolitan
11514,St Kilda,3,h,1000000,3,1,1,198,Port Phillip,Southern Metropolitan
3436,Keilor East,3,t,770000,3,2,1,331,Moonee Valley,Western Metropolitan
12294,Fawkner,2,t,515000,2,1,2,177,,Northern Metropolitan
131,Alphington,4,h,1830000,4,2,2,606,Darebin,Northern Metropolitan
4600,Pascoe Vale,4,h,975000,4,1,1,733,Moreland,Northern Metropolitan


### Target Feature

In [6]:
TARGET_FEATURE = 'price'

Y = data[TARGET_FEATURE]

Y.head()

0    1480000
1    1035000
2    1465000
3     850000
4    1600000
Name: price, dtype: int64

In [7]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13518 entries, 0 to 13517
Data columns (total 10 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   suburb     13518 non-null  object
 1   rooms      13518 non-null  int64 
 2   type       13518 non-null  object
 3   price      13518 non-null  int64 
 4   bedroom    13518 non-null  int64 
 5   bathroom   13518 non-null  int64 
 6   garage     13518 non-null  int64 
 7   size       13518 non-null  int64 
 8   authority  12211 non-null  object
 9   region     13518 non-null  object
dtypes: int64(6), object(4)
memory usage: 1.0+ MB


As we can see in the output.

1. There are **13580 entries**
1. There are total **21 features (0 to 20)**
1. There are three types of datatype dtypes: **float64(12), int64(1), object(8)**
1. It's Memory usage that is, memory usage: **2.2+ MB**
1. Also, We can check how many missing values available in the **Non-Null Count** column

In [8]:
data.describe()

Unnamed: 0,rooms,price,bedroom,bathroom,garage,size
count,13518.0,13518.0,13518.0,13518.0,13518.0,13518.0
mean,2.939784,1074796.0,2.916408,1.53536,1.610075,558.110593
std,0.956438,639858.6,0.966692,0.69231,0.962634,3998.19456
min,1.0,85000.0,0.0,0.0,0.0,0.0
25%,2.0,650000.0,2.0,1.0,1.0,178.0
50%,3.0,901000.0,3.0,1.0,2.0,442.5
75%,3.0,1328000.0,3.0,2.0,2.0,651.0
max,10.0,9000000.0,20.0,8.0,10.0,433014.0


Here,`describe()` method provides us the complete calculations details about the dataset. i.e. let's take the `price` feature for example. It shows the what's the `min`, `max`, `mean(average)` and `std(standard deviation)` of price feature.

In [9]:
data.price.describe([.2, .4, .6, .8])

count    1.351800e+04
mean     1.074796e+06
std      6.398586e+05
min      8.500000e+04
20%      6.000000e+05
40%      8.000000e+05
50%      9.010000e+05
60%      1.037600e+06
80%      1.450000e+06
max      9.000000e+06
Name: price, dtype: float64

### Categorical Features

In [10]:
numeric_features = data.select_dtypes(['int', 'float']).columns

numeric_features , len(numeric_features)

(Index(['rooms', 'price', 'bedroom', 'bathroom', 'garage', 'size'], dtype='object'),
 6)

### Numerical Features

In [11]:
categorical_features = data.select_dtypes('object').columns

categorical_features, len(categorical_features)

(Index(['suburb', 'type', 'authority', 'region'], dtype='object'), 4)

In [12]:
print("Number of `Numerical` Features are:", len(numeric_features) )
print("Number of `Categorical` Features are:", len(categorical_features) )


Number of `Numerical` Features are: 6
Number of `Categorical` Features are: 4


### Find the Missing Values

Here, We need to find how many missing values are there in our datasets.

In [13]:
data.isna().sum().sort_values(ascending=False)

authority    1307
suburb          0
rooms           0
type            0
price           0
bedroom         0
bathroom        0
garage          0
size            0
region          0
dtype: int64

Here, We convert the number of missing values into percentages. So, we can easly understand to how many percentage of missing values available.

In [14]:
(data.isna().sum() * 100 / data.isna().count()).sort_values(ascending=False)

authority    9.66859
suburb       0.00000
rooms        0.00000
type         0.00000
price        0.00000
bedroom      0.00000
bathroom     0.00000
garage       0.00000
size         0.00000
region       0.00000
dtype: float64

- In abouve output, We can clearly see that, There are only four feature `'BuildingArea', 'YearBuilt', 'CouncilArea'` and `'Car'` has null values. So we have to fill some statastical values.

### Filling Missing Values

In [15]:
data[['price', 'size', 'garage']].describe(include='all')

Unnamed: 0,price,size,garage
count,13518.0,13518.0,13518.0
mean,1074796.0,558.110593,1.610075
std,639858.6,3998.19456,0.962634
min,85000.0,0.0,0.0
25%,650000.0,178.0,1.0
50%,901000.0,442.5,2.0
75%,1328000.0,651.0,2.0
max,9000000.0,433014.0,10.0


- As we can see There is one feature have categorical values and rest are numerical features.


In [16]:
# Now, is there any missing values are there?
data.isna().any()

suburb       False
rooms        False
type         False
price        False
bedroom      False
bathroom     False
garage       False
size         False
authority     True
region       False
dtype: bool

#### Categorical Features

In [17]:
print("Total Records :", len(data) )

for col in categorical_features:
    print("Total Unique Records of "+ col + " =",  len(data[col].unique()))

Total Records : 13518
Total Unique Records of suburb = 314
Total Unique Records of type = 3
Total Unique Records of authority = 34
Total Unique Records of region = 8


###### Removing columns which has huge number of unique values

In [18]:
categorical_features = categorical_features.drop('suburb')

In [19]:
# Let's see again the number of unique records

print("Total Records :", len(data) )

for col in categorical_features:
    print("Total Unique Records of "+ col + " =",  len(data[col].unique()))

Total Records : 13518
Total Unique Records of type = 3
Total Unique Records of authority = 34
Total Unique Records of region = 8


# 3. Data Visualizations


In [20]:
data.sample(4)


Unnamed: 0,suburb,rooms,type,price,bedroom,bathroom,garage,size,authority,region
9043,Boronia,3,h,706500,3,2,3,723,Knox,Eastern Metropolitan
1168,Brighton East,2,u,750000,2,1,3,305,Bayside,Southern Metropolitan
4248,Newport,2,h,922000,2,1,0,301,Hobsons Bay,Western Metropolitan
7666,Burwood,3,h,1590000,3,2,4,862,Monash,Southern Metropolitan


## Feature Selection


- Here, We need to convert categorical values to numerical values

In [21]:
data[categorical_features].value_counts()

type  authority          region                    
h     Moreland           Northern Metropolitan         823
      Boroondara         Southern Metropolitan         755
      Darebin            Northern Metropolitan         689
      Moonee Valley      Western Metropolitan          665
      Maribyrnong        Western Metropolitan          486
                                                      ... 
t     Banyule            Northern Metropolitan           1
      Frankston          South-Eastern Metropolitan      1
      Moonee Valley      Northern Metropolitan           1
u     Cardinia           Eastern Victoria                1
t     Greater Dandenong  South-Eastern Metropolitan      1
Name: count, Length: 126, dtype: int64

- Implementing `LabelEncoder` to convert categorical values to numerical values.

In [23]:
from sklearn.preprocessing import LabelEncoder

In [24]:
# Here we need to define feature_columns the we convert to number in the below cell

categorical_features = [ 'suburb', 'authority', 'region']
categorical_features

['suburb', 'authority', 'region']

In [25]:
# Encoding ...

for column in categorical_features:
    l_encoder = LabelEncoder()
    data[column] = l_encoder.fit_transform(data[column])
    

- Creating Training Feature to train the model to predict the beter accuracy.

In [26]:

training_features = list(numeric_features) + list(categorical_features)

# Remove 'Price' Feature from list
training_features.remove('price')

# show the final list
training_features

['rooms',
 'bedroom',
 'bathroom',
 'garage',
 'size',
 'suburb',
 'authority',
 'region']

### Scalling Dataset

- Here, We are using `MinMaxScaler` to notmalize our dataset. Firstly, we need to import that class from `sklearn.preprocessing` package

In [27]:
from sklearn.preprocessing import MinMaxScaler
minMaxNorm = MinMaxScaler()
minMaxNorm.fit(data[training_features])
X = minMaxNorm.transform(data[training_features]) 

In [28]:
#from sklearn.preprocessing import StandardScaler
#scaler = StandardScaler()
#scaler.fit(data[training_features])


Create `X` data and assignning from `training feature` columns from `data` and make it normalized.

In [29]:
scaled_data = minMaxNorm.transform(data[training_features])
scaled_data

array([[0.11111111, 0.1       , 0.125     , ..., 0.        , 0.93939394,
        0.28571429],
       [0.11111111, 0.1       , 0.125     , ..., 0.        , 0.93939394,
        0.28571429],
       [0.22222222, 0.15      , 0.25      , ..., 0.        , 0.93939394,
        0.28571429],
       ...,
       [0.22222222, 0.15      , 0.25      , ..., 0.97444089, 1.        ,
        0.85714286],
       [0.33333333, 0.2       , 0.125     , ..., 0.97444089, 1.        ,
        0.85714286],
       [0.33333333, 0.2       , 0.125     , ..., 1.        , 1.        ,
        0.85714286]])

In [30]:
from sklearn.decomposition import PCA

# Create PCA object
#pca = PCA(n_components=n_components)
pca = PCA()

# Fit PCA to the scaled features
pca.fit(scaled_data)

# Explained variance ratio for each component
explained_variance_ratio = pca.explained_variance_ratio_

# Print explained variance ratio
print("Explained variance ratio per component:")
print(explained_variance_ratio)

# Cumulative explained variance ratio
cumulative_explained_variance = np.cumsum(explained_variance_ratio)

# Print cumulative explained variance ratio
print("\nCumulative explained variance ratio:")
print(cumulative_explained_variance)


Explained variance ratio per component:
[3.62853250e-01 2.72405794e-01 2.65422983e-01 6.43833993e-02
 2.15974969e-02 1.23580226e-02 6.99493101e-04 2.79561732e-04]

Cumulative explained variance ratio:
[0.36285325 0.63525904 0.90068203 0.96506543 0.98666292 0.99902095
 0.99972044 1.        ]


In [31]:
#Transform the data using the fitted PCA model
X = pca.transform(scaled_data)
X

array([[ 4.33218295e-01,  5.54677992e-01,  6.55310062e-02, ...,
         4.00254703e-03, -6.09011837e-04, -7.51288098e-04],
       [ 4.35927582e-01,  5.53663504e-01,  6.49230197e-02, ...,
        -6.25705224e-05,  3.55630689e-05, -6.83101613e-04],
       [ 4.31765170e-01,  5.59790110e-01,  6.71413159e-02, ...,
         3.90511375e-02,  7.76275746e-04, -1.16837511e-03],
       ...,
       [ 4.21866218e-01, -4.47752714e-01, -4.34596522e-01, ...,
         4.92097515e-02, -9.94066266e-04, -1.02763822e-03],
       [ 4.18410962e-01, -4.45628458e-01, -4.30883893e-01, ...,
        -1.17585650e-01,  1.91420337e-03,  2.43790675e-04],
       [ 4.32504407e-01, -4.74975618e-01, -4.31874240e-01, ...,
        -1.33791460e-01,  4.49684741e-03, -2.24813229e-04]])

In [32]:
Y = data['price']  
Y

0        1480000
1        1035000
2        1465000
3         850000
4        1600000
          ...   
13513    1245000
13514    1031000
13515    1170000
13516    2500000
13517    1285000
Name: price, Length: 13518, dtype: int64

## Splite Train and Test Dataset

### Splites the main data

- split data into training and validation data, for both features and target. The split is based on a random number generator.
- Supplying a numeric value to the random_state argument guarantees we get the same split eve run this script.

In [33]:
from sklearn.model_selection import train_test_split
train_X, test_X, train_Y, test_Y = train_test_split(X, Y, random_state = 0)

##### Check the train and test sized

In [34]:
print("Total size: ", data.shape[0])
print("Train size: ", train_X.shape, train_Y.shape)
print("Test size: ", test_X.shape, test_Y.shape)

Total size:  13518
Train size:  (10138, 8) (10138,)
Test size:  (3380, 8) (3380,)


# Model Building

#### Before Implementing ML Models we can create dataframe to stores the prediction values by each models that we are implementing below here.

In [35]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import  RandomForestRegressor
from sklearn.ensemble import  BaggingRegressor 
from sklearn.ensemble import  AdaBoostRegressor
from sklearn.ensemble import  GradientBoostingRegressor

from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

from sklearn.tree import DecisionTreeRegressor

from xgboost import XGBRegressor

from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error


### Linear Regression Model

In [36]:
lr_model = LinearRegression()
lr_model.fit(train_X, train_Y)
lr_model_predicted = lr_model.predict(test_X)

###### Store model and it's predictoin score in dataframe that we created below the model building section

In [37]:
print('model_name', lr_model.__class__.__name__)

lr_model_score = lr_model.score(test_X, test_Y )
print('prediction_score', lr_model_score)

mae = mean_absolute_error(test_Y, lr_model_predicted)
print('Mean Absolute Error', mae)

mse = mean_squared_error(test_Y, lr_model_predicted)
print("Mean Squared Error:", mse)


model_name LinearRegression
prediction_score 0.31879497288486924
Mean Absolute Error 365947.74436788994
Mean Squared Error: 261607458842.05637


Now, let's see the model_summary dataframe

### Decision Tree Regressor Model

In [38]:
Dtree_model = DecisionTreeRegressor(random_state=1)
Dtree_model.fit(train_X, train_Y)
Dtree_model_predicted = Dtree_model.predict(test_X)

In [39]:
print('model_name', Dtree_model.__class__.__name__)

Dtree_model_score = Dtree_model.score(test_X, test_Y)
print('prediction_score', Dtree_model_score)

mae = mean_absolute_error(Dtree_model_predicted, test_Y)
print('Mean Absolute Error', mae)

mse = mean_squared_error(lr_model_predicted, test_Y)
print("Mean Squared Error:", mse)

model_name DecisionTreeRegressor
prediction_score 0.30622502911125726
Mean Absolute Error 299789.58617140516
Mean Squared Error: 261607458842.05637


### Random Forest Regressor Model

Here, We implement other models from ensemble package.

In [40]:
RFRModel = RandomForestRegressor(max_leaf_nodes=100, random_state=1)
RFRModel.fit(train_X, train_Y)
RFRModel_predicted = RFRModel.predict(test_X)

In [41]:
print('model_name', RFRModel.__class__.__name__)

RFRModel_score = RFRModel.score(test_X, test_Y)
print('prediction_score', RFRModel_score)

mae = mean_absolute_error(RFRModel_predicted, test_Y)
print('mean_absolute_error', mae)

mse = mean_squared_error(RFRModel_predicted, test_Y)
print("Mean Squared Error:", mse)
 

model_name RandomForestRegressor
prediction_score 0.5857642382710591
mean_absolute_error 274423.445065866
Mean Squared Error: 159081569680.0438


In [42]:
BGR_model = BaggingRegressor()
BGR_model.fit(train_X, train_Y)
BGR_model_predicted = BGR_model.predict(test_X)

In [43]:
print('model_name', BGR_model.__class__.__name__)

BGR_model_score = BGR_model.score(test_X, test_Y)
print('prediction_score', BGR_model_score)

mae = mean_absolute_error(test_Y, BGR_model_predicted)
print('mean_absolute_error', mae)

mse = mean_squared_error(BGR_model_predicted, test_Y)
print("Mean Squared Error:", mse)

model_name BaggingRegressor
prediction_score 0.6075031996365436
mean_absolute_error 242537.64850357163
Mean Squared Error: 150733019369.4647


### Ada Boost Regressor Model

In [44]:
ADB_model = AdaBoostRegressor()
ADB_model.fit(train_X, train_Y)
ADB_model_predicted = ADB_model.predict(test_X)

In [45]:
print('model_name', ADB_model.__class__.__name__)

ADB_model_score = ADB_model.score(test_X, test_Y)
print('prediction_score', ADB_model_score)

mae = mean_absolute_error(test_Y, ADB_model_predicted)
print('mean_absolute_error', mae)

mse = mean_squared_error(test_Y, ADB_model_predicted)
print("Mean Squared Error:", mse)

model_name AdaBoostRegressor
prediction_score -0.43353855191935997
mean_absolute_error 640106.0868877128
Mean Squared Error: 550530842832.9641


### Gradient Boosting Regressor Model

In [46]:
GBR_model = GradientBoostingRegressor(n_estimators=150, random_state=1)
GBR_model.fit(train_X, train_Y)
GBR_model_predicted = GBR_model.predict(test_X)

In [47]:
print('model_name', GBR_model.__class__.__name__)

GBR_model_score = GBR_model.score(test_X, test_Y)
print('prediction_score', GBR_model_score)

mae = mean_absolute_error(test_Y, GBR_model_predicted)
print('mean_absolute_error', mae)

mse = mean_squared_error(test_Y, GBR_model_predicted)
print("Mean Squared Error:", mse)

model_name GradientBoostingRegressor
prediction_score 0.6058880484000411
mean_absolute_error 265103.027226932
Mean Squared Error: 151353296075.9013


### Random Forest Regressor Model

In [48]:
XGBR_model = XGBRegressor()
XGBR_model.fit(train_X, train_Y)
XGBR_model_predicted = XGBR_model.predict(test_X)

In [49]:
print('model_name', XGBR_model.__class__.__name__)

XGBR_model_score = XGBR_model.score(test_X, test_Y)
print('prediction_score', XGBR_model_score)

mae = mean_absolute_error(test_Y, XGBR_model_predicted)
print('mean_absolute_error', mae)

mse = mean_squared_error(test_Y, XGBR_model_predicted)
print("Mean Squared Error:", mse)
 

model_name XGBRegressor
prediction_score 0.6528828703837466
mean_absolute_error 228978.4265347633
Mean Squared Error: 133305578474.70082


Let's implement some hyper param tunning using `n_estimators=500, max_depth=10, learning_rate=0.05`.

In [50]:
XGBR_model_500 = XGBRegressor(n_estimators=500, max_depth=10, learning_rate=0.05)
XGBR_model_500.fit(train_X, train_Y)

XGBR_model_500_predicted = XGBR_model_500.predict(test_X)

In [51]:
print('model_name: XGBRegressor(n_estimators=500, max_depth=10, learning_rate=0.05)')

XGBR_model_500_score = XGBR_model_500.score(test_X, test_Y)
print('prediction_score', XGBR_model_500_score)

mae = mean_absolute_error(test_Y, XGBR_model_500_predicted)
print(mae)

mse = mean_squared_error(test_Y, XGBR_model_500_predicted)
print("Mean Absolete Error:", mse)

model_name: XGBRegressor(n_estimators=500, max_depth=10, learning_rate=0.05)
prediction_score 0.6262156484393583
226009.69829881657
Mean Absolete Error: 143546759748.41074


# HYPER PARAMS TUNNING 

- Using GridSearchCV.


In [52]:
xgbr_model = XGBRegressor() # {'objective': 'reg:squarederror' }

params = {
    'n_estimators': [110, 120, 130, 140], 
    'learning_rate': [ 0.05, 0.075, 0.1],
    'max_depth': [ 7, 9],
    'reg_lambda': [0.3, 0.5]
}

xgb_reg = GridSearchCV(estimator=xgbr_model, param_grid=params, cv=5, n_jobs=-1)
xgb_reg.fit(train_X, train_Y)
xgbr_model_pred = xgb_reg.predict(test_X)

In [53]:
xgbr_model_score = xgb_reg.best_score_
print("Best score: %0.3f" % xgb_reg.best_score_)

mae = mean_absolute_error(test_Y, xgbr_model_pred)
print("mean_absolute_error :", mae)

mse = mean_squared_error(test_Y, xgbr_model_pred)
print("Mean Squared Error:", mse)

print("Best parameters set:", xgb_reg.best_params_)

Best score: 0.658
mean_absolute_error : 222567.25792344674
Mean Squared Error: 125154682139.92537
Best parameters set: {'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 140, 'reg_lambda': 0.5}


In [54]:
rfr_model = RandomForestRegressor(random_state=35)

rfr_params_grid = {
    'n_estimators' : [600,750,800,850],
    'max_depth' : [7],
    'max_features': [5],
    'min_samples_leaf' : [3],
    'min_samples_split' : [4, 6 ,9]
}

gscv_rfr_cv = GridSearchCV(estimator=rfr_model, 
                                      param_grid=rfr_params_grid,
                                      cv = 5 ,
                                      n_jobs = -1,
                                      verbose = 5)

gscv_rfr_cv.fit(train_X, train_Y)
gscv_rfr_cv_pred = gscv_rfr_cv.predict(test_X)

Fitting 5 folds for each of 12 candidates, totalling 60 fits


In [55]:
print('model_name Random Forest Regressor Hyper Params Tunning')

gscv_rfr_cv_score = gscv_rfr_cv.best_score_
print("Best score: %0.3f" % gscv_rfr_cv.best_score_)

mae = mean_absolute_error(test_Y, gscv_rfr_cv_pred)
print("mean_absolute_error :", mae)

print("Best parameters set:", gscv_rfr_cv.best_params_)

mse = mean_squared_error(test_Y, gscv_rfr_cv_pred)
print("Mean Squared Error:", mse)

model_name Random Forest Regressor Hyper Params Tunning
Best score: 0.539
mean_absolute_error : 285723.6897784329
Best parameters set: {'max_depth': 7, 'max_features': 5, 'min_samples_leaf': 3, 'min_samples_split': 4, 'n_estimators': 600}
Mean Squared Error: 166653136467.54254
