In [7]:
# library
install.packages("GGally")
library(tidyverse)
library(repr)
library(tidymodels)
library(GGally)
install.packages("cowplot")
library(cowplot)
options(repr.matrix.max.rows = 6)

Updating HTML index of packages in '.Library'

Making 'packages.html' ...
 done

Updating HTML index of packages in '.Library'

Making 'packages.html' ...
 done



# Finding investment properties in Melbourne

# Introduction
In recent years, global housing prices have been experiencing different changes in response to the market. In Melbourne, housing prices are also generating fluctuations and generating a lot of discussion. Known for its diversity and cultural scene, Melbourne has also been a popular property market that attracts investors from all over the world, and thus Melbourne's house prices have been stimulating interest in research. In this project, we will focus on Melbourne's house prices, hoping to create a model that can predict house prices more accurately. Along with information on a number of influencing factors, we will be able to answer the question, "What homes in Melbourne are undervalued or overvalued?". The crux of the question is to determine if the property is worth investing in. To answer this question, we will use the Melbourne Housing Dataset. In this dataset we have 21 variables and we want to select 8 of them as the main influencing factors. We will use the corr() method to select these 8 factors, which will be the major determinants of house prices, so we will select these 8 factors to model and predict whether different houses in Melbourne are worth buying at different house prices.



In [13]:
housing_data<- read_csv("https://raw.githubusercontent.com/jun2021/DSCI-100-Group-Project/main/melb_data.csv")
housing_data
class(housing_data)



[1mRows: [22m[34m13580[39m [1mColumns: [22m[34m21[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m  (8): Suburb, Address, Type, Method, SellerG, Date, CouncilArea, Regionname
[32mdbl[39m (13): Rooms, Price, Distance, Postcode, Bedroom2, Bathroom, Car, Landsiz...

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.


Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,⋯,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
<chr>,<chr>,<dbl>,<chr>,<dbl>,<chr>,<chr>,<chr>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>,<dbl>,<dbl>,<chr>,<dbl>
Abbotsford,85 Turner St,2,h,1480000,S,Biggin,3/12/2016,2.5,3067,⋯,1,1,202,,,Yarra,-37.7996,144.9984,Northern Metropolitan,4019
Abbotsford,25 Bloomburg St,2,h,1035000,S,Biggin,4/02/2016,2.5,3067,⋯,1,0,156,79,1900,Yarra,-37.8079,144.9934,Northern Metropolitan,4019
Abbotsford,5 Charles St,3,h,1465000,SP,Biggin,4/03/2017,2.5,3067,⋯,2,0,134,150,1900,Yarra,-37.8093,144.9944,Northern Metropolitan,4019
⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋱,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮
Williamstown,83 Power St,3,h,1170000,S,Raine,26/08/2017,6.8,3016,⋯,2,4,436,,1997,,-37.85274,144.8874,Western Metropolitan,6380
Williamstown,96 Verdon St,4,h,2500000,PI,Sweeney,26/08/2017,6.8,3016,⋯,1,5,866,157,1920,,-37.85908,144.8930,Western Metropolitan,6380
Yarraville,6 Agnes St,4,h,1285000,SP,Village,26/08/2017,6.3,3013,⋯,1,1,362,112,1920,,-37.81188,144.8845,Western Metropolitan,6543


In [16]:
housing_data <- housing_data[,-2] |>
                na.omit()


housing_data




Unnamed: 0_level_0,Suburb,Price,Method,SellerG,Date,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
Unnamed: 0_level_1,<chr>,<dbl>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>,<dbl>,<dbl>,<chr>,<dbl>
2,Abbotsford,1035000,S,Biggin,4/02/2016,2.5,3067,2,1,0,156,79,1900,Yarra,-37.8079,144.9934,Northern Metropolitan,4019
3,Abbotsford,1465000,SP,Biggin,4/03/2017,2.5,3067,3,2,0,134,150,1900,Yarra,-37.8093,144.9944,Northern Metropolitan,4019
5,Abbotsford,1600000,VB,Nelson,4/06/2016,2.5,3067,3,1,2,120,142,2014,Yarra,-37.8072,144.9941,Northern Metropolitan,4019
⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮
12208,Williamstown,385000,SP,Williams,29/07/2017,6.8,3016,1,1,1,0,35.64,1967,Hobsons Bay,-37.85588,144.8994,Western Metropolitan,6380
12210,Windsor,560000,PI,hockingstuart,29/07/2017,4.6,3181,2,1,1,0,61.60,2012,Stonnington,-37.85581,144.9903,Southern Metropolitan,4380
12213,Yarraville,2450000,VB,Village,29/07/2017,6.3,3013,6,3,2,1087,388.50,1920,Maribyrnong,-37.81038,144.8939,Western Metropolitan,6543


In [17]:
housing_split<- initial_split(housing_data,prop = 0.75,strata = Price)
housing_testing<- testing(housing_split)
housing_training<- training(housing_split)



In [None]:
housing_pairplot <- housing_training |> 
    select(-Suburb, -SellerG, -Date, -CouncilArea)|>
    ggpairs(
        lower = list(continuous = wrap('points', alpha = 0.4)),
        diag = list(continuous = "barDiag")
    ) +
    theme(text = element_text(size = 20))


housing_pairplot



[1m[22m`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
[1m[22m`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
[1m[22m`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
[1m[22m`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
[1m[22m`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
[1m[22m`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
[1m[22m`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
[1m[22m`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
[1m[22m`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
[1m[22m`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
[1m[22m`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
[1m[22m`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
[1m[22m`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
[1m[22m`st

In [7]:
lm_spec <- linear_reg() |>
           set_engine("lm") |>
           set_mode("regression")

In [9]:
lm_recipe <- recipe(Price ~ ., data = housing_training)

lm_fit  <- workflow() |>
           add_recipe(lm_recipe) |>
           add_model(lm_spec) |>
           fit(data = housing_training)
            