In [2]:
library(tidyverse)
library(caret)
library(car)
library(glmnet)
library(dplyr)

setwd('C:/Users/iceca/Documents/Housing_Regression')

set.seed(42)

m <- modules::use("Helpers/Loading_Data.R")
m <- m$load.train

dat <- m$skew.data()

test <- dat[[2]] 
colnames(test) <- make.names(colnames(test))

train <- dat[[1]]
labels <- m$get.labels() %>% arrange(Id) %>% filter(Id %in% train$Id)
train <- dat[[1]] %>% arrange(Id) %>% mutate(SalePrice = labels$SalePrice, NumberLevel= as.numeric(train$NumberLevel)) 
colnames(train) <- make.names(colnames(train))
train <- train %>% mutate_if(is.logical, function(x) {as.numeric(x)})

cat("train has" , nrow(train), "rows.")
#I need to combine the train and test into one model matrix to ensure that train and test have the same factors
dat.matrix <- sparse.model.matrix(SalePrice ~. , rbind( train , test %>% mutate(SalePrice=0) ) )
train.matrix <- dat.matrix[1:nrow(train),]
test.matrix <- dat.matrix[ (1+nrow(train)) : dim(dat.matrix)[1] ,]

cross.validation.index <- createDataPartition(labels$SalePrice,times = 1,p = 0.15,list = FALSE)
cv <- train[cross.validation.index , ]
train.cv <- train[-cross.validation.index , ]
train.raw <- m$featEngRaw.data()[[1]]
cross.validation.index <- cross.validation.index %>% as.vector()
cv.matrix <- train.matrix[cross.validation.index, ]
cv.matrix.labels <- labels$SalePrice[cross.validation.index]
train.cv.matrix <- train.matrix[-cross.validation.index, ]
train.cv.matrix.labels <- labels$SalePrice[-cross.validation.index]


"package 'tidyverse' was built under R version 3.6.3"-- Attaching packages --------------------------------------- tidyverse 1.3.0 --
v ggplot2 3.3.2     v purrr   0.3.4
v tibble  3.0.3     v dplyr   1.0.1
v tidyr   1.1.1     v stringr 1.4.0
v readr   1.3.1     v forcats 0.4.0
"package 'purrr' was built under R version 3.6.3"-- Conflicts ------------------------------------------ tidyverse_conflicts() --
x dplyr::filter() masks stats::filter()
x dplyr::lag()    masks stats::lag()
Loading required package: lattice

Attaching package: 'caret'

The following object is masked from 'package:purrr':

    lift

"package 'car' was built under R version 3.6.3"Loading required package: carData
"package 'carData' was built under R version 3.6.3"
Attaching package: 'car'

The following object is masked from 'package:dplyr':

    recode

The following object is masked from 'package:purrr':

    some

Loading required package: Matrix

Attaching package: 'Matrix'

The following objects are masked fro

train has 1453 rows.

In [21]:
#HYPERTUNING THE SUPPORT VECTOR MACHINES
#similar to randomforsts I will start with a random search
library(e1071)
set.seed(5)
params <- expand.grid( cost=2^(-5:2), gamma=2^(-15:-1))
cv.length=10
costs <-  rep(0,cv.length)
gammas <- rep(0,cv.length)
errors <- rep(0,cv.length)
nrounds<- rep(0,cv.length)


for (i in 1:cv.length) {
    #select the parameters
    par <- params[sample(nrow(params) , 1),]
    costs[i] = par$cost
    gammas[i] = par$gamma
    cat("i ",i," cost ", costs[i], "gamma", gammas[i], "\n")
    
    #train the model and calulate errors
    svm.fit <- svm(SalePrice~., data.frame(x=train.cv%>% select(-SalePrice), SalePrice=train.cv$SalePrice), 
                        kernel="linear",cost=costs[i], gamma = gammas[i] )
    errors[i]= sqrt(mean( (log( predict(svm.fit , newdata = data.frame(x=cv))) - log(cv$SalePrice))^2 ))
    cat("error is ", errors[i], "\n", "\n")
    
    if(is.na(errors[i])){ stop("NANS produced")}

    #save the results to a csv incase training fails
    svm.results <- data.frame(gamma = gammas, cost=costs, errors=errors)
    write.csv(svm.results, file="Models/Trees/SMV_CVResults.csv")
}



i  1  cost  0.0625 gamma 0.0078125 
error is  0.1251981 
 
i  2  cost  0.125 gamma 0.25 
error is  0.1274331 
 
i  3  cost  0.03125 gamma 0.00390625 
error is  0.1250411 
 
i  4  cost  2 gamma 0.015625 
error is  0.132292 
 
i  5  cost  0.125 gamma 0.015625 
error is  0.1274331 
 
i  6  cost  0.03125 gamma 0.0009765625 
error is  0.1250411 
 
i  7  cost  0.5 gamma 0.03125 
error is  0.1301225 
 
i  8  cost  1 gamma 0.0625 
error is  0.1318727 
 
i  9  cost  2 gamma 0.0078125 
error is  0.132292 
 
i  10  cost  2 gamma 0.5 
error is  0.132292 
 


In [29]:
svm.fit <- svm(SalePrice~., data.frame(x=train.cv%>% select(-SalePrice), SalePrice=train.cv$SalePrice), 
                        kernel="linear",cost=0.0625 , gamma = 0.0078125 )

sqrt(mean( (log( predict(svm.fit , newdata = data.frame(x=cv))) - log(cv$SalePrice))^2 ))
test.predict <- data.frame(Id=test$Id , SalePrice = predict(svm.fit , newdata=data.frame(x=test)))
write.csv(test.predict , "Models/SupportVectorMachines/svm predictions.csv", row.names=FALSE)

In [6]:
#HYPERTUNING THE SUPPORT VECTOR MACHINES RADIAL
#similar to randomforsts I will start with a random search
library(e1071)
set.seed(2)
params <- expand.grid( cost=2^(-5:2), gamma=2^(-10:2))
cv.length=30
costs <-  rep(0,cv.length)
gammas <- rep(0,cv.length)
errors <- rep(0,cv.length)
nrounds<- rep(0,cv.length)


for (i in 1:cv.length) {
    #select the parameters
    par <- params[sample(nrow(params) , 1),]
    costs[i] = par$cost
    gammas[i] = par$gamma
    cat("i ",i," cost ", costs[i], "gamma", gammas[i], "\n")
    
    #train the model and calulate errors
    svm.fit <- svm(SalePrice~., data.frame(x=train.cv%>% select(-SalePrice), SalePrice=train.cv$SalePrice), kernel =
        "radial" ,cost=costs[i], gamma = gammas[i],  )
    errors[i]= sqrt(mean( (log( predict(svm.fit , newdata = data.frame(x=cv))) - log(cv$SalePrice))^2 ))
    cat("error is ", errors[i], "\n", "\n")
    
    if(is.na(errors[i])){ stop("NANS produced")}

    #save the results to a csv incase training fails
    svm.results <- data.frame(gamma = gammas, cost=costs, errors=errors)
    write.csv(svm.results, file="Models/Trees/SMV_radial_CVResults.csv")
}



i  1  cost  0.5 gamma 1 
error is  0.3787204 
 
i  2  cost  2 gamma 0.5 
error is  0.3846295 
 
i  3  cost  1 gamma 0.25 
error is  0.3711603 
 
i  4  cost  1 gamma 0.0009765625 
error is  0.1280997 
 
i  5  cost  4 gamma 0.0078125 
error is  0.1134759 
 
i  6  cost  4 gamma 0.0009765625 
error is  0.1145768 
 
i  7  cost  0.03125 gamma 0.00390625 
error is  0.1800395 
 
i  8  cost  0.5 gamma 2 
error is  0.3788478 
 
i  9  cost  0.03125 gamma 1 
error is  0.3791225 
 
i  10  cost  0.25 gamma 0.5 
error is  0.37864 
 
i  11  cost  0.03125 gamma 0.03125 
error is  0.2643792 
 
i  12  cost  0.0625 gamma 0.0625 
error is  0.3113057 
 
i  13  cost  0.125 gamma 0.5 
error is  0.3789586 
 
i  14  cost  0.03125 gamma 0.25 
error is  0.3785395 
 
i  15  cost  0.125 gamma 0.0009765625 
error is  0.1626027 
 
i  16  cost  4 gamma 0.5 
error is  0.3879496 
 
i  17  cost  4 gamma 2 
error is  0.3889358 
 
i  18  cost  0.0625 gamma 0.0625 
error is  0.3113057 
 
i  19  cost  2 gamma 0.0625 
error i

In [7]:
svm.fit <- svm(SalePrice~., data.frame(x=train.cv%>% select(-SalePrice), SalePrice=train.cv$SalePrice), 
                        kernel="radial",cost=2 , gamma = 0.00390625  )

sqrt(mean( (log( predict(svm.fit , newdata = data.frame(x=cv))) - log(cv$SalePrice))^2 ))
test.predict <- data.frame(Id=test$Id , SalePrice = predict(svm.fit , newdata=data.frame(x=test)))
write.csv(test.predict , "Models/SupportVectorMachines/svm radial predictions.csv", row.names=FALSE)

In [13]:
#HYPERTUNING THE SUPPORT VECTOR MACHINES POLYNOMIAL
#similar to randomforsts I will start with a random search
library(e1071)
set.seed(22)
params <- expand.grid( cost=2^(-5:2), gamma=2^(-12:2), degree = 3)
cv.length=40
costs <-  rep(0,cv.length)
gammas <- rep(0,cv.length)
errors <- rep(0,cv.length)
nrounds<- rep(0,cv.length)
degrees <- rep(0,cv.length)

for (i in 1:cv.length) {
    #select the parameters
    par <- params[sample(nrow(params) , 1),]
    costs[i] = par$cost
    gammas[i] = par$gamma
    degrees[i] = par$degree
    cat("i ",i," cost ", costs[i], "gamma", gammas[i],"degree", degrees[i], "\n")
    
    #train the model and calulate errors
    svm.fit <- svm(SalePrice~., data.frame(x=train.cv%>% select(-SalePrice), SalePrice=train.cv$SalePrice), kernel =
        "polynomial" ,cost=costs[i], gamma = gammas[i],  degree=degrees[i])
    errors[i]= sqrt(mean( (log( predict(svm.fit , newdata = data.frame(x=cv))) - log(cv$SalePrice))^2 ))
    cat("error is ", errors[i], "\n", "\n")
    
    if(is.na(errors[i])){ stop("NANS produced")}

    #save the results to a csv incase training fails
    svm.results <- data.frame(gamma = gammas, cost=costs, errors=errors)
    write.csv(svm.results, file="Models/Trees/SMV_radial_CVResults.csv")
}



i  1  cost  1 gamma 1 degree 3 
error is  0.1243793 
 
i  2  cost  0.03125 gamma 0.0004882812 degree 3 
error is  0.3793463 
 
i  3  cost  4 gamma 0.25 degree 3 
error is  0.1243793 
 
i  4  cost  0.0625 gamma 0.125 degree 3 
error is  0.1243793 
 
i  5  cost  1 gamma 0.5 degree 3 
error is  0.1243793 
 
i  6  cost  0.25 gamma 0.0078125 degree 3 
error is  0.1389529 
 
i  7  cost  0.125 gamma 0.03125 degree 3 
error is  0.1182918 
 
i  8  cost  0.03125 gamma 0.25 degree 3 
error is  0.1243793 
 
i  9  cost  0.5 gamma 1 degree 3 
error is  0.1243793 
 
i  10  cost  0.125 gamma 2 degree 3 
error is  0.1243793 
 
i  11  cost  0.125 gamma 0.0625 degree 3 
error is  0.1244172 
 
i  12  cost  4 gamma 0.0078125 degree 3 
error is  0.1181587 
 
i  13  cost  0.25 gamma 4 degree 3 
error is  0.1243793 
 
i  14  cost  4 gamma 0.0004882812 degree 3 
error is  0.3737674 
 
i  15  cost  0.0625 gamma 0.03125 degree 3 
error is  0.1181587 
 
i  16  cost  4 gamma 0.0625 degree 3 
error is  0.1243793 
 

In [15]:
svm.fit <- svm(SalePrice~., data.frame(x=train.cv%>% select(-SalePrice), SalePrice=train.cv$SalePrice), 
                        kernel="polynomial",cost=0.125 , gamma = 0.03125  )

sqrt(mean( (log( predict(svm.fit , newdata = data.frame(x=cv))) - log(cv$SalePrice))^2 ))
test.predict <- data.frame(Id=test$Id , SalePrice = predict(svm.fit , newdata=data.frame(x=test)))
write.csv(test.predict , "Models/SupportVectorMachines/svm poly predictions.csv", row.names=FALSE)