# Julia 機器學習：GLM 線性迴歸

## 作業 027：波士頓房價預測資料集

請使用 GLM 中的模型，建立一個預測模型來預測波士頓的房價。

In [1]:
using Pkg
# Make sure every package this notebook needs is present in the active environment.
#
# `Pkg.installed()` is deprecated as of Julia 1.4 and emits a warning on every call;
# `Pkg.dependencies()` is its supported replacement. It returns an entry for every
# package in the manifest, so it is restricted to direct dependencies to keep the
# same "explicitly added packages" meaning that `Pkg.installed()` had.
mInstalled = Dict(
    info.name => info.version for
    info in values(Pkg.dependencies()) if info.is_direct_dep
)
mNeedModules = ["GLM", "RDatasets", "MLDataUtils"]

for i in mNeedModules
    if haskey(mInstalled, i)
        # Already available: just report it.
        println(i)
    else
        # Missing from the environment: install it.
        Pkg.add(i)
    end
end

└ @ Pkg D:\buildbot\worker\package_win64\build\usr\share\julia\stdlib\v1.4\Pkg\src\Pkg.jl:531


GLM
RDatasets
MLDataUtils


In [2]:
using GLM, RDatasets, MLDataUtils

## 讀取資料

In [3]:
using DataFrames
# Load the "Boston" table from the "MASS" collection bundled with RDatasets.
boston = RDatasets.dataset("MASS", "Boston")

# Preview the first ten rows (last expression of the cell, so it is displayed).
first(boston, 10)

Unnamed: 0_level_0,Crim,Zn,Indus,Chas,NOx,Rm,Age,Dis,Rad,Tax
Unnamed: 0_level_1,Float64,Float64,Float64,Int64,Float64,Float64,Float64,Float64,Int64,Int64
1,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296
2,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242
3,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242
4,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222
5,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222
6,0.02985,0.0,2.18,0,0.458,6.43,58.7,6.0622,3,222
7,0.08829,12.5,7.87,0,0.524,6.012,66.6,5.5605,5,311
8,0.14455,12.5,7.87,0,0.524,6.172,96.1,5.9505,5,311
9,0.21124,12.5,7.87,0,0.524,5.631,100.0,6.0821,5,311
10,0.17004,12.5,7.87,0,0.524,6.004,85.9,6.5921,5,311


In [4]:
# Column reference for `boston` (names as they appear in the DataFrame):
# Crim    - per capita crime rate by town
# Zn      - proportion of residential land zoned for lots over 25,000 sq.ft.
# Indus   - proportion of non-retail business acres per town
# Chas    - Charles River dummy variable (1 if tract bounds river; 0 otherwise)
# NOx     - nitric oxides concentration (parts per 10 million)
# Rm      - average number of rooms per dwelling
# Age     - proportion of owner-occupied units built prior to 1940
# Dis     - weighted distances to five Boston employment centres
# Rad     - index of accessibility to radial highways
# Tax     - full-value property-tax rate per $10,000
# PTRatio - pupil-teacher ratio by town
# Black   - 1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town
# LStat   - % lower status of the population
# MedV    - median value of owner-occupied homes in $1000's
import Random

# `describe` is not the last expression of this cell, so its result would be
# silently discarded; display it explicitly to actually show the summary table.
display(describe(boston))

# Seed the global RNG so the shuffle (and therefore the train/test split and every
# metric computed from it) is reproducible across runs.
# NOTE(review): assumes `shuffleobs` draws from the global RNG by default — confirm
# against the installed MLDataUtils/MLDataPattern version.
Random.seed!(1234)

# Shuffle the row indices, then split them 70% / 30% into train and test indices.
indecies = MLDataUtils.shuffleobs(collect(1:nrow(boston)))
train_ind, test_ind = MLDataUtils.splitobs(indecies, at = 0.7);

# Report the size of the full table and of each partition.
println(size(boston))
println(size(train_ind))
println(size(test_ind))

(506, 14)
(354,)
(152,)


In [5]:
# Materialize the training and test partitions from the shuffled index sets.
train, test = boston[train_ind, :], boston[test_ind, :]

# Ordinary least squares fit of MedV on all thirteen predictor columns, using the
# training rows only. The assignment is the last expression, so the fitted model
# (including its coefficient table) is displayed.
model = GLM.lm(
    @formula(
        MedV ~
            Crim + Zn + Indus + Chas + NOx + Rm + Age + Dis + Rad + Tax + PTRatio + Black + LStat
    ),
    train,
)

StatsModels.TableRegressionModel{LinearModel{GLM.LmResp{Array{Float64,1}},GLM.DensePredChol{Float64,LinearAlgebra.Cholesky{Float64,Array{Float64,2}}}},Array{Float64,2}}

MedV ~ 1 + Crim + Zn + Indus + Chas + NOx + Rm + Age + Dis + Rad + Tax + PTRatio + Black + LStat

Coefficients:
────────────────────────────────────────────────────────────────────────────────────
                 Estimate  Std. Error    t value  Pr(>|t|)    Lower 95%    Upper 95%
────────────────────────────────────────────────────────────────────────────────────
(Intercept)   40.0157      6.03563      6.62992     <1e-9    28.1439     51.8876
Crim          -0.112011    0.0366222   -3.05855     0.0024   -0.184046   -0.0399763
Zn             0.031563    0.0175724    1.79617     0.0734   -0.0030012   0.0661273
Indus         -0.0241264   0.0752897   -0.320447    0.7488   -0.172219    0.123966
Chas           1.6425      1.07797      1.52369     0.1285   -0.477841    3.76283
NOx          -17.1658      4.71295     -3.64227  

In [6]:
# Predicted MedV for every row of the held-out `test` partition.
pred = GLM.predict(model, test)

152-element Array{Union{Missing, Float64},1}:
 21.9814852044194
 35.31150959606779
 20.091161176361876
 19.920592740911598
 33.28524773176393
 28.14840411361203
 36.77621773572644
 25.755530765455035
 16.1803931560178
 34.193630306216406
 13.155408425759198
 34.17127774693581
 14.477760728363084
  ⋮
 17.604608440040373
  6.453457668709834
 19.049550643007727
  5.366639217867931
 21.23837901066758
 35.041089012015306
 23.54429946363697
  1.5688400801335103
 13.561522188634441
 18.79563361921952
 25.113413488955963
 40.518428623132735

In [7]:
# Coefficient of determination (R²) of the fitted model. `model` was fitted on
# `train`, so this is an in-sample figure, not a test-set score.
GLM.r2(model)

0.7281322527654628

In [8]:
# Adjusted R² of the fitted model (R² penalized for the number of predictors);
# like R² above it describes the fit on the training rows.
GLM.adjr2(model)

0.7177373094888482