.Rhistory

training_frame = train_h2o,
model_id = "dl_fit3",
validation_frame = valid_h2o,  #in DL, early stopping is on by default
epochs = 20,
hidden = c(10,10),
score_interval = 1,           #used for early stopping
stopping_rounds = 3,          #used for early stopping
stopping_metric = "AUTO",     #used for early stopping
stopping_tolerance = 0.0005,   #used for early stopping
seed = 1)
# Score the three deep-learning models against valid_h2o.
dl_perf1 <- h2o.performance(model = dl_fit1, newdata = valid_h2o)
dl_perf2 <- h2o.performance(model = dl_fit2, newdata = valid_h2o)
dl_perf3 <- h2o.performance(model = dl_fit3, newdata = valid_h2o)

# Print each performance object so the metrics can be compared.
dl_perf1
dl_perf2
dl_perf3
# Stacked ensemble: GLM, random-forest and GBM base learners combined by a GLM
# metalearner, using 5 cross-validation folds (cvControl V = 5).
# Metrics is attached up front so that rmse() is available on first use
# (presumably rmse() comes from Metrics -- confirm).
library(Metrics)

learner <- c("h2o.glm.wrapper", "h2o.randomForest.wrapper", "h2o.gbm.wrapper")
metalearner <- "h2o.glm.wrapper"
fit <- h2o.ensemble(
  x = x, y = y,
  training_frame = train_h2o,
  family = "gaussian",
  learner = learner,
  metalearner = metalearner,
  cvControl = list(V = 5)
)

# Ensemble performance on valid_h2o.
perf <- h2o.ensemble_performance(fit, newdata = valid_h2o)

# predict() on the ensemble returns a list; its `pred` element holds the
# ensemble predictions. Pull predictions and labels into plain R vectors.
pred <- predict(fit, valid_h2o)
predictions <- as.data.frame(pred$pred)[, 1]
labels <- as.data.frame(valid_h2o[, y])[, 1]

# Validation RMSE, computed once.
rmse(predictions, labels)
# Second stacked ensemble: adds the deep-learning wrapper to the base learners
# and uses a GBM as the metalearner.
learner <- c(
  "h2o.glm.wrapper",
  "h2o.randomForest.wrapper",
  "h2o.gbm.wrapper",
  "h2o.deeplearning.wrapper"
)
metalearner <- "h2o.gbm.wrapper"

fit <- h2o.ensemble(
  x = x,
  y = y,
  training_frame = train_h2o,
  family = "gaussian",
  learner = learner,
  metalearner = metalearner,
  cvControl = list(V = 5)
)

# Evaluate on valid_h2o and report the RMSE of the ensemble predictions.
perf <- h2o.ensemble_performance(fit, newdata = valid_h2o)
pred <- predict(fit, valid_h2o)
predictions <- as.data.frame(pred$pred)[, 1]
labels <- as.data.frame(valid_h2o[, y])[, 1]
rmse(predictions, labels)
# Random forest (1000 trees) trained on `data`, then used to write a
# submission file.
fit <- h2o.randomForest(
  x = x,
  y = y,
  training_frame = data,
  model_id = "rf_fit2",
  # validation_frame = valid_h2o,  # only used if stopping_rounds > 0
  ntrees = 1000,
  seed = 1
)
fit
# NOTE(review): `rf_fit2` is not assigned in this block; presumably it is a
# model object left over from earlier in the session -- confirm.
rf_fit2

# Take the first column of the prediction frame instead of `pred$pred`, which
# only resolved to the `predict` column through partial `$` matching.
pred <- predict(fit, test_h2o)
predictions <- as.data.frame(pred)[, 1]

# Back-transform: expm1(v) is exp(v) - 1, the inverse of a log(y + 1) target.
results <- expm1(predictions)
solution <- data.frame(Id = as.integer(rownames(x_test)), SalePrice = results)
write.csv(solution, "randomforest.csv", row.names = FALSE)
# GBM with early stopping, trained on `data`, then used to write a submission
# file.
# NOTE(review): `data` is defined outside this block. If it contains the rows
# of valid_h2o, the early-stopping metric is computed on rows the model was
# trained on -- confirm.
fit <- h2o.gbm(
  x = x,
  y = y,
  training_frame = data,
  model_id = "gbm_fit3",
  validation_frame = valid_h2o,  # only used if stopping_rounds > 0
  ntrees = 500,
  score_tree_interval = 5,       # used for early stopping
  stopping_rounds = 3,           # used for early stopping
  stopping_metric = "AUTO",      # used for early stopping
  stopping_tolerance = 0.0005,   # used for early stopping
  seed = 1
)

# Take the first column of the prediction frame instead of `pred$pred`, which
# only resolved to the `predict` column through partial `$` matching.
pred <- predict(fit, test_h2o)
predictions <- as.data.frame(pred)[, 1]

# Back-transform: expm1(v) is exp(v) - 1, the inverse of a log(y + 1) target.
results <- expm1(predictions)
solution <- data.frame(Id = as.integer(rownames(x_test)), SalePrice = results)
write.csv(solution, "gbm.csv", row.names = FALSE)
# Deep-learning model with default settings, trained on `data`, then used to
# write a submission file.
fit <- h2o.deeplearning(
  x = x,
  y = y,
  training_frame = data,
  model_id = "dl_fit1",
  seed = 1
)

# Take the first column of the prediction frame instead of `pred$pred`, which
# only resolved to the `predict` column through partial `$` matching.
pred <- predict(fit, test_h2o)
predictions <- as.data.frame(pred)[, 1]

# Back-transform: expm1(v) is exp(v) - 1, the inverse of a log(y + 1) target.
results <- expm1(predictions)
solution <- data.frame(Id = as.integer(rownames(x_test)), SalePrice = results)
write.csv(solution, "nn.csv", row.names = FALSE)

# Inspect session objects (all defined outside this block).
lasso_mod
dl_fit1
y
x
# Four gaussian GLMs trained on `data`.

# fit1: default settings, no validation frame.
fit1 <- h2o.glm(
  x = x, y = y,
  training_frame = data,
  model_id = "glm_fit1",
  family = "gaussian"
)

# fit2: lambda search, scored against valid_h2o.
fit2 <- h2o.glm(
  x = x, y = y,
  training_frame = data,
  model_id = "glm_fit2",
  validation_frame = valid_h2o,
  family = "gaussian",
  lambda_search = TRUE
)

# fit3: lasso penalty (alpha = 1) with lambda search.
fit3 <- h2o.glm(
  x = x, y = y,
  training_frame = data,
  model_id = "glm_fit3",
  validation_frame = valid_h2o,
  family = "gaussian",
  alpha = 1,
  lambda_search = TRUE
)

# fit4: ridge penalty (alpha = 0) with lambda search.
fit4 <- h2o.glm(
  x = x, y = y,
  training_frame = data,
  model_id = "glm_fit4",
  validation_frame = valid_h2o,
  family = "gaussian",
  alpha = 0,
  lambda_search = TRUE
)
# Four random forests trained on `data`, all with seed = 1.

# fit5: default settings.
fit5 <- h2o.randomForest(
  x = x, y = y,
  training_frame = data,
  model_id = "rf_fit1",
  seed = 1
)

# fit6: 1000 trees; no validation frame is passed (it is only used when
# stopping_rounds > 0).
fit6 <- h2o.randomForest(
  x = x, y = y,
  training_frame = data,
  model_id = "rf_fit2",
  ntrees = 1000,
  seed = 1
)

# fit7: up to 1000 trees, early stopping on MSE after 1 scoring round.
fit7 <- h2o.randomForest(
  x = x, y = y,
  training_frame = data,
  model_id = "rf_fit3",
  validation_frame = valid_h2o,
  stopping_rounds = 1,
  stopping_metric = "MSE",
  ntrees = 1000,
  seed = 1
)

# fit8: 2000 trees with a validation frame; early stopping is left disabled
# (stopping_rounds / stopping_metric are not set).
fit8 <- h2o.randomForest(
  x = x, y = y,
  training_frame = data,
  model_id = "rf_fit4",
  validation_frame = valid_h2o,
  ntrees = 2000,
  seed = 1
)
# Three GBMs trained on `data`, all with seed = 1.

# fit9: default settings.
fit9 <- h2o.gbm(
  x = x, y = y,
  training_frame = data,
  model_id = "gbm_fit1",
  seed = 1
)

# fit10: 500 trees; no validation frame is passed (it is only used when
# stopping_rounds > 0).
fit10 <- h2o.gbm(
  x = x, y = y,
  training_frame = data,
  model_id = "gbm_fit2",
  ntrees = 500,
  seed = 1
)

# fit11: up to 500 trees with early stopping, scored every 5 trees on
# valid_h2o.
fit11 <- h2o.gbm(
  x = x, y = y,
  training_frame = data,
  model_id = "gbm_fit3",
  validation_frame = valid_h2o,
  ntrees = 500,
  score_tree_interval = 5,
  stopping_rounds = 3,
  stopping_metric = "AUTO",
  stopping_tolerance = 0.0005,
  seed = 1
)
# Stacked ensemble built from the default wrappers: four base learners combined
# by a GLM metalearner, trained on `data` with 5 cross-validation folds.
learner <- c(
  "h2o.glm.wrapper",
  "h2o.randomForest.wrapper",
  "h2o.gbm.wrapper",
  "h2o.deeplearning.wrapper"
)
metalearner <- "h2o.glm.wrapper"

fit12 <- h2o.ensemble(
  x = x,
  y = y,
  training_frame = data,
  family = "gaussian",
  learner = learner,
  metalearner = metalearner,
  cvControl = list(V = 5)
)
# Average the back-transformed test predictions of the eleven single models
# (fit1..fit11) and the stacked ensemble (fit12), then write the submission.
#
# The per-model predict/back-transform step is applied through one helper over
# a list instead of eleven copy-pasted pairs, and the divisor is taken from the
# number of models rather than hard-coded.
single_fits <- list(fit1, fit2, fit3, fit4, fit5, fit6,
                    fit7, fit8, fit9, fit10, fit11)

# exp(p) - 1 undoes a log(y + 1) target; each result stays an H2OFrame.
single_results <- lapply(single_fits, function(model) {
  exp(h2o.predict(model, test_h2o)) - 1
})

# The ensemble's predict() returns a list whose `pred` element holds the
# predictions. They are pulled into R, back-transformed, and pushed back to
# H2O so they can be added to the other frames.
pred12 <- predict(fit12, test_h2o)
predictions12 <- as.data.frame(pred12$pred)[, 1]
results12 <- as.h2o(expm1(predictions12))

# Left-to-right sum of all result frames, divided by the model count.
all_results <- c(single_results, list(results12))
avgresults <- Reduce(`+`, all_results) / length(all_results)
avgresults <- as.data.frame(avgresults)

# Passing the column as a vector names it SalePrice directly, so no
# colnames() patch is needed afterwards.
solution <- data.frame(
  Id = as.integer(rownames(x_test)),
  SalePrice = avgresults[[1]]
)
head(solution)
tail(solution)
write.csv(solution, "h2oavgmodelfinal.csv", row.names = FALSE)
# Refit the four-learner stacked ensemble (GLM metalearner) on train_h2o.
learner <- c(
  "h2o.glm.wrapper",
  "h2o.randomForest.wrapper",
  "h2o.gbm.wrapper",
  "h2o.deeplearning.wrapper"
)
metalearner <- "h2o.glm.wrapper"

fit <- h2o.ensemble(
  x = x,
  y = y,
  training_frame = train_h2o,
  family = "gaussian",
  learner = learner,
  metalearner = metalearner,
  cvControl = list(V = 5)
)

# RMSE of the four GLM performance objects (defined outside this block).
h2o.rmse(glm_perf1)
h2o.rmse(glm_perf2)
h2o.rmse(glm_perf3)
h2o.rmse(glm_perf4)
# Two-model blend evaluated on valid_h2o: a lasso GLM and an early-stopped GBM,
# both trained on train_h2o. Each model is trained once; the second, identical
# training of both models has been dropped.
fit_a <- h2o.glm(
  x = x,
  y = y,
  training_frame = train_h2o,
  model_id = "glm_fit3",
  validation_frame = valid_h2o,
  family = "gaussian",
  alpha = 1,  # lasso penalty
  lambda_search = TRUE
)
fit_b <- h2o.gbm(
  x = x,
  y = y,
  training_frame = train_h2o,
  model_id = "gbm_fit3",
  validation_frame = valid_h2o,  # only used if stopping_rounds > 0
  ntrees = 500,
  score_tree_interval = 5,       # used for early stopping
  stopping_rounds = 3,           # used for early stopping
  stopping_metric = "AUTO",      # used for early stopping
  stopping_tolerance = 0.0005,   # used for early stopping
  seed = 1
)

# Equal-weight average of the two prediction frames.
pred1 <- predict(fit_a, valid_h2o)
pred2 <- predict(fit_b, valid_h2o)
pred1
totalpred <- (pred1 + pred2) / 2
totalpred

# Pull predictions and labels into plain R vectors and preview them.
predictions <- as.data.frame(totalpred)[, 1]
labels <- as.data.frame(valid_h2o[, y])[, 1]
head(predictions)
head(labels)

# Both are vectors, so compare lengths (dim() of a vector is NULL).
length(predictions)
length(labels)
stopifnot(length(predictions) == length(labels))

rmse(predictions, labels)
# Final two-model blend: the lasso GLM and the early-stopped GBM are retrained
# on `data`, their test predictions are averaged, and the submission is
# written. The failed attempts are gone: the call to the undefined
# `prediction()` and the two `solution` frames built from an H2OFrame.
# NOTE(review): `data` is defined outside this block. If it contains the rows
# of valid_h2o, the validation metrics and early stopping are computed on
# training rows -- confirm.
fit_a <- h2o.glm(
  x = x,
  y = y,
  training_frame = data,
  model_id = "glm_fit3",
  validation_frame = valid_h2o,
  family = "gaussian",
  alpha = 1,  # lasso penalty
  lambda_search = TRUE
)
fit_b <- h2o.gbm(
  x = x,
  y = y,
  training_frame = data,
  model_id = "gbm_fit3",
  validation_frame = valid_h2o,  # only used if stopping_rounds > 0
  ntrees = 500,
  score_tree_interval = 5,       # used for early stopping
  stopping_rounds = 3,           # used for early stopping
  stopping_metric = "AUTO",      # used for early stopping
  stopping_tolerance = 0.0005,   # used for early stopping
  seed = 1
)

# Equal-weight average of the two prediction frames.
pred_a <- predict(fit_a, test_h2o)
pred_b <- predict(fit_b, test_h2o)
pred_a
full_pred <- (pred_a + pred_b) / 2

# Take the first column of the averaged frame instead of `full_pred$pred`,
# which depended on partial `$` matching of the column name.
predictions <- as.data.frame(full_pred)[, 1]

# Back-transform: expm1(v) is exp(v) - 1, the inverse of a log(y + 1) target.
results <- expm1(predictions)
solution <- data.frame(Id = as.integer(rownames(x_test)), SalePrice = results)
head(predictions)
head(solution)
tail(solution)
write.csv(solution, "2modelensemble.csv", row.names = FALSE)

# Shut the H2O instance down without asking for confirmation.
h2o.shutdown(prompt = FALSE)
# Set up the environment for parallel computing.
#
# The workspace wipe (`rm(list = ls())`) has been removed: it silently destroys
# whatever the caller has in the global environment. Start from a fresh R
# session instead when a clean workspace is needed.

# memory.size() is only meaningful on Windows and is a stub from R 4.2.0 on,
# so it is reported only where it still works.
if (.Platform$OS.type == "windows" && getRversion() < "4.2.0") {
  memory.size(max = TRUE)
}

library(doParallel)
library(foreach)

# Two-worker cluster registered as the foreach backend.
# NOTE: nothing in this block stops the cluster; call stopCluster(c1) when the
# parallel work is finished.
c1 <- makeCluster(2)
registerDoParallel(c1)
# Load the libraries
library(ggplot2)
library(dplyr)
library(data.table)
library(caret)
library(moments)
library(corrplot)
library(VIM)
library(mice)
library(plyr)
library(Amelia)
library(psych)
library(Metrics)
# Read the training data, keeping strings as factors.
train <- read.csv("data/train.csv", stringsAsFactors = TRUE)
train_eda <- train  # save a copy for eda purposes
str(train)

# Missing-value count per column. colSums() replaces the manual loop, whose
# accumulator was allocated with length(ncol(train)) == 1 and then grown one
# element at a time. The counts are the same, now labelled by column name.
a <- colSums(is.na(train))
a

# Drop columns 4, 7, 58, 73, 74 and 75 by position.
train <- train[, c(-4, -7, -58, -73, -74, -75)]
str(train)
# test data
test <- read.csv("data/test.csv", stringsAsFactors = TRUE)
str(test)

# Add an all-NA integer SalePrice column so test can be row-bound to train.
# This states NA directly instead of obtaining it by coercing "" to integer.
test$SalePrice <- rep(NA_integer_, nrow(test))

# Drop the same column positions that were dropped from train.
test <- test[, c(-4, -7, -58, -73, -74, -75)]
total <- rbind(train, test)

# Missing-value count per column. colSums() replaces the manual loop, whose
# accumulator was allocated with length(ncol(total)) == 1 and then grown one
# element at a time. The counts are the same, now labelled by column name.
a <- colSums(is.na(total))  # check NAs
a
# Near-zero-variance metrics for the training columns.
nearZeroVar(train, saveMetrics = TRUE)

# Columns to drop from the combined data.
removeVar <- c(
  "Street", "LandContour", "Utilities", "LandSlope", "Condition2",
  "RoofMatl", "BsmtCond", "BsmtFinType2", "BsmtFinSF2", "Heating",
  "LowQualFinSF", "KitchenAbvGr", "Functional", "GarageQual", "GarageCond",
  "EnclosedPorch", "X3SsnPorch", "ScreenPorch", "PoolArea", "MiscVal"
)
keep_col <- !(names(total) %in% removeVar)
total <- total[, keep_col]

# Convert these count-like and date-like columns to factors.
for (factor_col in c("MoSold", "YrSold", "HalfBath", "FullBath", "GarageCars",
                     "BsmtFullBath", "BsmtHalfBath", "Fireplaces")) {
  total[[factor_col]] <- as.factor(total[[factor_col]])
}
# Box plots of SalePrice against four categorical features of train.
train %>%
  select(Neighborhood, SalePrice) %>%
  ggplot(aes(factor(Neighborhood), SalePrice)) +
  geom_boxplot() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  xlab('Neighborhoods')

train %>%
  select(CentralAir, SalePrice) %>%
  ggplot(aes(factor(CentralAir), SalePrice)) +
  geom_boxplot() +
  xlab('CentralAir')

train %>%
  select(FullBath, SalePrice) %>%
  ggplot(aes(factor(FullBath), SalePrice)) +
  geom_boxplot() +
  xlab('FullBath')

train %>%
  select(GarageFinish, SalePrice) %>%
  ggplot(aes(factor(GarageFinish), SalePrice)) +
  geom_boxplot() +
  xlab('GarageFinish')

# Keep the target aside, then remove it and Id from the combined data.
y_train <- train$SalePrice
total$SalePrice <- NULL
total$Id <- NULL
# Class label per column of `total`. vapply() guarantees a named character
# vector; any factor (including ordered ones) is labelled "factor".
feature_classes <- vapply(
  total,
  function(column) if (is.factor(column)) "factor" else class(column)[1],
  character(1)
)
numeric_vars <- names(feature_classes)[feature_classes != "factor"]

# Skewness of every non-factor column.
skewed_vars <- vapply(
  numeric_vars,
  function(var_name) skewness(total[[var_name]], na.rm = TRUE),
  numeric(1)
)

# Keep only clearly right-skewed variables. The is.na() guard stops columns
# with undefined skewness from surviving as NA-named entries.
skewed_vars <- skewed_vars[!is.na(skewed_vars) & skewed_vars > 0.75]

numericdf <- total[, numeric_vars]
numericdf
anyNA(numericdf)
?cor
# R Markdown chunk defaults: echo code in the rendered output.
knitr::opts_chunk$set(echo = TRUE)

# Load the saved numeric data frame used for the correlation plot.
load(file = "../numericdf.RData")

# Correlation plot, first version: pairwise-complete correlations.
library(corrplot)
ames_cor <- cor(numericdf, use = "pairwise.complete.obs")
col <- colorRampPalette(c("#BB4444","#EE9988","#FFFFFF","#77AADD","#4477AA"))
corrplot(
  ames_cor,
  method = "shade",
  shade.col = NA,
  tl.col = "black",
  tl.cex = 0.75,
  tl.srt = 45,
  col = col(200),
  number.digits = 1,
  addCoef.col = "black"
)

# Correlation plot, second version: cor() with its default handling of
# missing values.
library(corrplot)
ames_cor <- cor(numericdf)
col <- colorRampPalette(c("#BB4444","#EE9988","#FFFFFF","#77AADD","#4477AA"))
corrplot(
  ames_cor,
  method = "shade",
  shade.col = NA,
  tl.col = "black",
  tl.cex = 0.75,
  tl.srt = 45,
  col = col(200),
  number.digits = 1,
  addCoef.col = "black"
)
# Set up the environment for parallel computing.
#
# The workspace wipe (`rm(list = ls())`) has been removed: it silently destroys
# whatever the caller has in the global environment. Start from a fresh R
# session instead when a clean workspace is needed.

# memory.size() is only meaningful on Windows and is a stub from R 4.2.0 on,
# so it is reported only where it still works.
if (.Platform$OS.type == "windows" && getRversion() < "4.2.0") {
  memory.size(max = TRUE)
}

library(doParallel)
library(foreach)

# Two-worker cluster registered as the foreach backend.
# NOTE: nothing in this block stops the cluster; call stopCluster(c1) when the
# parallel work is finished.
c1 <- makeCluster(2)
registerDoParallel(c1)
# Load the libraries
library(ggplot2)
library(dplyr)
library(data.table)
library(caret)
library(moments)
library(corrplot)
library(VIM)
library(mice)
library(plyr)
library(Amelia)
library(psych)
library(Metrics)
# train data
train <- read.csv("data/train.csv", stringsAsFactors = TRUE)
train_eda <- train  # save a copy for eda purposes
str(train)

# Missing-value count per column. colSums() replaces the manual loop, whose
# accumulator was allocated with length(ncol(train)) == 1 and then grown one
# element at a time. The counts are the same, now labelled by column name.
a <- colSums(is.na(train))
a

# Drop columns 4, 7, 58, 73, 74 and 75 by position.
train <- train[, c(-4, -7, -58, -73, -74, -75)]
str(train)
# test data
test <- read.csv("data/test.csv", stringsAsFactors = TRUE)
str(test)

# Add an all-NA integer SalePrice column so test can be row-bound to train.
# This states NA directly instead of obtaining it by coercing "" to integer.
test$SalePrice <- rep(NA_integer_, nrow(test))

# Drop the same column positions that were dropped from train.
test <- test[, c(-4, -7, -58, -73, -74, -75)]

# combine train and test
total <- rbind(train, test)

# Missing-value count per column. colSums() replaces the manual loop, whose
# accumulator was allocated with length(ncol(total)) == 1 and then grown one
# element at a time. The counts are the same, now labelled by column name.
a <- colSums(is.na(total))  # check NAs
a
################### EDA AND DATA PREPROCESSING #####################

# Summary statistics and the distribution of the target.
describe(total)
hist(total$SalePrice)  # skewed; needs a transform towards normality

# Near-zero-variance metrics, followed by removal of near-constant features.
nearZeroVar(train, saveMetrics = TRUE)
removeVar <- c(
  "Street", "LandContour", "Utilities", "LandSlope", "Condition2",
  "RoofMatl", "BsmtCond", "BsmtFinType2", "BsmtFinSF2", "Heating",
  "LowQualFinSF", "KitchenAbvGr", "Functional", "GarageQual", "GarageCond",
  "EnclosedPorch", "X3SsnPorch", "ScreenPorch", "PoolArea", "MiscVal"
)
keep_col <- !(names(total) %in% removeVar)
total <- total[, keep_col]

# Convert selected integer columns to factors.
for (factor_col in c("MoSold", "YrSold", "HalfBath", "FullBath", "GarageCars",
                     "BsmtFullBath", "BsmtHalfBath", "Fireplaces")) {
  total[[factor_col]] <- as.factor(total[[factor_col]])
}
# SalePrice by Neighborhood (x labels rotated 90 degrees).
train %>%
  select(Neighborhood, SalePrice) %>%
  ggplot(aes(factor(Neighborhood), SalePrice)) +
  geom_boxplot() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  xlab('Neighborhoods')

# SalePrice by FullBath.
train %>%
  select(FullBath, SalePrice) %>%
  ggplot(aes(factor(FullBath), SalePrice)) +
  geom_boxplot() +
  xlab('FullBath')

# SalePrice by GarageFinish.
train %>%
  select(GarageFinish, SalePrice) %>%
  ggplot(aes(factor(GarageFinish), SalePrice)) +
  geom_boxplot() +
  xlab('GarageFinish')

# SalePrice by CentralAir.
train %>%
  select(CentralAir, SalePrice) %>%
  ggplot(aes(factor(CentralAir), SalePrice)) +
  geom_boxplot() +
  xlab('CentralAir')

# Keep the target aside, then remove it and Id from the combined data.
y_train <- train$SalePrice
total$SalePrice <- NULL
total$Id <- NULL
# get numeric variables and determine the variables that have high skew
# Class label per column of `total`. vapply() guarantees a named character
# vector; any factor (including ordered ones) is labelled "factor".
feature_classes <- vapply(
  total,
  function(column) if (is.factor(column)) "factor" else class(column)[1],
  character(1)
)
numeric_vars <- names(feature_classes)[feature_classes != "factor"]

# Skewness of every non-factor column.
skewed_vars <- vapply(
  numeric_vars,
  function(var_name) skewness(total[[var_name]], na.rm = TRUE),
  numeric(1)
)

# Keep only clearly right-skewed variables. The is.na() guard stops columns
# with undefined skewness from surviving as NA-named entries.
skewed_vars <- skewed_vars[!is.na(skewed_vars) & skewed_vars > 0.75]
# impute missing values using mice package
missmap(total, main = "Missing Map")

# A fixed seed makes the imputed values reproducible between runs.
# NOTE: three imputations are generated (m = 3), but complete() with its
# default arguments returns only one completed data set.
imp <- mice(total, m = 3, method = "cart", maxit = 1, seed = 1)

# Call mice's complete() explicitly so that a same-named function from another
# attached package cannot be picked up instead.
total <- mice::complete(imp)
missmap(total)
anyNA(total)  # expected FALSE once imputation has filled every NA

# Numeric columns of the imputed data, saved for loading into the Rmd file.
numericdf <- total[, numeric_vars]
save(numericdf, file = "numericdf.RData")
# Correlation matrix of the numeric columns, drawn with shaded cells,
# coefficients overlaid, and variables ordered by "AOE".
ames_cor <- cor(numericdf)
col <- colorRampPalette(c("#BB4444","#EE9988","#FFFFFF","#77AADD","#4477AA"))
corrplot(ames_cor, method = "shade", shade.col = NA, tl.col = "black",
         tl.srt = 45, col = col(200), addCoef.col = "black", order = "AOE")

# Numeric columns of the original (un-imputed) training data and whether any
# of them still contain NA. numeric_vars holds column names, so it belongs in
# the column position; the earlier row-position call has been dropped.
train[, numeric_vars]
anyNA(train[, numeric_vars])
# Log-transform the target; log1p(v) is log(v + 1).
y_train <- log1p(y_train)

# One-hot encode the factor columns; NA cells in the encoded matrix become 0.
cat_vars <- names(feature_classes[feature_classes == "factor"])
dummies <- dummyVars(~ ., total[cat_vars])
cat_encode <- predict(dummies, total[cat_vars])
cat_encode[is.na(cat_encode)] <- 0

# Apply log1p to every skewed numeric column in one step. This replaces a
# loop whose index variable was named `x` and therefore overwrote any
# session-level `x`.
total[names(skewed_vars)] <- lapply(total[names(skewed_vars)], log1p)

# Numeric columns followed by the encoded categorical columns.
total <- cbind(total[numeric_vars], cat_encode)

# Split back into train and test rows by position. seq_len() keeps both index
# vectors valid even if there are no test rows.
x_train <- total[seq_len(nrow(train)), ]
x_test <- total[nrow(train) + seq_len(nrow(total) - nrow(train)), ]
# Correlation plot of the numeric columns: shaded cells with coefficients
# overlaid at one digit.
library(corrplot)
ames_cor <- cor(numericdf)
col <- colorRampPalette(c("#BB4444","#EE9988","#FFFFFF","#77AADD","#4477AA"))
corrplot(
  ames_cor,
  method = "shade",
  shade.col = NA,
  tl.col = "black",
  tl.cex = 0.75,
  tl.srt = 45,
  col = col(200),
  number.digits = 1,
  addCoef.col = "black"
)