<DIV ALIGN=CENTER>
# Iowa Housing Price Prediction with Lasso and Ridge Models
</DIV>  
---

### First, load the required libraries

In [1]:
# Install any missing packages, then attach everything the notebook uses.
# Each package is checked under its own name: the original guards for
# "moments" and "corrplot" tested for DAAG instead, so those two were never
# installed when absent. requireNamespace() only tests availability; the
# library() calls below do the attaching and fail loudly if a package is
# still unavailable.
required.packages <- c(
    "dummies",   # dummy variables
    "moments",   # skewness of numeric features
    "corrplot",  # correlation plot
    "DAAG",      # lm cross-validation
    "glmnet"     # ridge and lasso models
)
for (pkg in required.packages) {
    if (!requireNamespace(pkg, quietly = TRUE)) {
        install.packages(pkg)
    }
}
library(dummies)   # dummy variables
library(moments)   # skewness
library(corrplot)  # corrplot
library(DAAG)      # cross-validation
library(glmnet)    # ridge and lasso

Loading required package: dummies
dummies-1.5.6 provided by Decision Patterns

Loading required package: DAAG
Loading required package: lattice
Loading required package: glmnet
Loading required package: Matrix
Loading required package: foreach
Loaded glmnet 2.0-5



In [2]:
# Load the training and test sets.
# stringsAsFactors = TRUE is spelled out because read.csv() only converts
# strings to factors by default before R 4.0.0, and the later cells identify
# categorical columns by their 'factor' class.
train <- read.csv('./train.csv', stringsAsFactors = TRUE)
test  <- read.csv('./test.csv', stringsAsFactors = TRUE)

# Stack train on top of test so both receive identical preprocessing.
# The last column of train is dropped so the column sets match
# (assumed to be the response, SalePrice -- it is re-attached later).
data <- rbind(train[, -ncol(train)], test)

# Number of missing values in each column (first column excluded).
# is.na() on a data frame already yields a logical matrix, so no apply() needed.
num.NA <- colSums(is.na(data[, -1]))
# Class of each column that has at least one missing value.
# List-style indexing keeps a data frame even when only one column qualifies.
data.type <- sapply(data[names(which(num.NA != 0))], class)

In [3]:
# Remove six columns that the original analysis judged too sparse to use
# (each reported as having more than 200 missing values).
drop.names <- c("LotFrontage", "Alley", "FireplaceQu",
                "PoolQC", "Fence", "MiscFeature")
keep.column <- is.na(match(names(data), drop.names))
data <- data[, keep.column]

In [4]:
# Split the feature columns into categorical and numerical ones.
# The first column is skipped (presumably the Id column -- it is also
# excluded from the model matrices further down).
data.type <- sapply(data[, -1], class)   # kept for inspection of column classes
feature.names <- names(data)[-1]

# Categorical features are the factor columns.
categorical.var <- feature.names[vapply(data[-1], is.factor, logical(1))]

# Numerical features: is.numeric() covers both integer and double columns.
# Comparing the class to 'integer' alone would skip double-typed columns,
# leaving them un-imputed and un-transformed by the following steps.
numerical.var <- feature.names[vapply(data[-1], is.numeric, logical(1))]

In [5]:
# Categorical features: turn missing values into an explicit NA level so that
# "missing" later becomes its own dummy column.
for (i in categorical.var) {
    data[[i]] <- addNA(data[[i]])
}

# Numerical features: impute missing values with the median of the same
# column in the training set (the test rows do not influence the fill value).
for (i in numerical.var) {
    na.id <- is.na(data[[i]])
    tmp.median <- median(train[[i]], na.rm = TRUE)
    data[which(na.id), i] <- tmp.median
}

# Log-transform numerical features whose skewness exceeds 0.75.
# data[numerical.var] (list-style indexing) always yields a data frame, so the
# skewness is computed per column even when there is only one numeric feature;
# data[, numerical.var] would collapse to a plain vector in that case.
feature.skewness <- vapply(data[numerical.var], skewness, numeric(1))
skewed.features <- numerical.var[which(feature.skewness > 0.75)]
for (i in skewed.features) {
    # log1p(x) is log(1 + x), computed accurately for small x
    data[[i]] <- log1p(data[[i]])
}

In [6]:
# Expand every categorical feature into indicator (dummy) columns and append
# them to the data set.
categorical.block <- data[, categorical.var]
dummy.var <- data.frame(dummy.data.frame(categorical.block, sep = '.'))
data <- cbind(data, dummy.var)

# The original categorical columns are now redundant; keep everything else.
is.categorical <- names(data) %in% categorical.var
data <- data[, !is.categorical]

# Undo the earlier row-wise stacking: the first nrow(train) rows are the
# training set, the remainder the test set.
n.train <- nrow(train)
n.total <- nrow(data)
data.train <- data[1:n.train, ]
data.test <- data[(n.train + 1):n.total, ]

# Re-attach the response on the log(1 + price) scale as the last column.
data.train['SalePrice'] <- log(train$SalePrice + 1)

# data.train and data.test are now ready for the ridge and lasso regressions below.

## Ridge regression

In [7]:
# Ridge regression (glmnet with alpha = 0).
# The response is the last column of data.train; the first column is excluded
# from the predictors as well (presumably the Id column).
pos <- ncol(data.train)
ridge.x <- as.matrix(data.train[, -c(1, pos)])
ridge.y <- data.train[, 'SalePrice']
ridge.newx <- as.matrix(data.test[, -1])

# 10-fold cross-validation over the lambda path. alpha = 0 must be passed here
# too: cv.glmnet() forwards it to glmnet(), whose default alpha = 1 is the
# lasso, so omitting it would tune lambda for the wrong penalty.
cv.ridge <- cv.glmnet(ridge.x, ridge.y, alpha = 0, nfolds = 10)
lambda_ridge <- cv.ridge$lambda.min   # lambda with the minimum mean cross-validated error
#lambda_ridge

# Refit at the selected lambda and predict on the test set.
ridge.fit <- glmnet(ridge.x, ridge.y, alpha = 0, lambda = lambda_ridge)
# The model was trained on log(SalePrice + 1), so invert with exp(.) - 1.
ridge.pred <- exp(predict(ridge.fit, s = lambda_ridge, newx = ridge.newx)) - 1
#cv.ridge$cvm

# Write the predictions in the submission format.
submission <- read.csv('./sample_submission.csv')
# predict() returns a one-column matrix; store it as a plain numeric column.
submission$SalePrice <- as.vector(ridge.pred)
write.table(submission, './ridge_Pred_all_k10.csv', row.names = FALSE, sep = ',')

## Lasso regression

In [8]:
# Lasso regression (glmnet with alpha = 1).
# Predictors: every column of data.train except the first and the response
# column at index `pos`; the test matrix only drops the first column.
lasso.x <- as.matrix(data.train[, -c(1, pos)])
lasso.y <- data.train[, 'SalePrice']
lasso.newx <- as.matrix(data.test[, -1])

# 10-fold cross-validation; alpha is left at glmnet's default of 1 (lasso).
cv.lasso <- cv.glmnet(lasso.x, lasso.y, nfolds = 10)
lambda_lasso <- cv.lasso$lambda.min   # lambda with the minimum mean cross-validated error
#lambda_lasso

# Refit at the selected lambda, then predict and map the log-scale output
# back to prices with exp(.) - 1.
lasso.fit <- glmnet(lasso.x, lasso.y, alpha = 1, lambda = lambda_lasso)
lasso.log.pred <- predict(lasso.fit, s = lambda_lasso, newx = lasso.newx)
lasso.pred <- exp(lasso.log.pred) - 1
#cv.lasso$cvm

# Fill the sample submission with the predictions and save it.
submission <- read.csv('./sample_submission.csv')
submission$SalePrice <- lasso.pred
write.table(submission, './lasso_Pred_all_k10.csv', row.names = FALSE, sep = ',')