##  Importing required libraries

In [None]:
library(ggplot2) # Data visualization
library(readr) # CSV file I/O, e.g. the read_csv function
library(gplots)
library(repr)

# Render notebook plots 9 wide by 6 high
options(repr.plot.height = 6, repr.plot.width = 9)

# Show which files are available in the input directory
list.files(path = "../input")

##  Importing dataset

In [None]:
# Training data and its row count
train <- read.csv("../input/train.csv")
ntrain <- nrow(train)

# Test data and its row count
test <- read.csv("../input/test.csv")
ntest <- nrow(test)

##  Finding dimensions of train and test datasets

In [None]:
# Rows and columns of each dataset, training first
dim(train)
dim(test)

## Head of train dataset

In [None]:
# Preview the first rows of the training data
head(train)

## Columns in train dataset

In [None]:
# List the column names of the training data
names(train)

## Summary of train dataset

In [None]:
# Per-column summary of the training data
summary(train)

## Identifying numeric and categorical variables

In [None]:
# Every column name present in the training data
all_cols <- names(train)

# Columns handled as numeric in the rest of the analysis (median imputation,
# correlations, histograms). The order here is the order plots are drawn in.
# Columns whose names start with a digit are referred to with an "X" prefix
# everywhere else in this file, so the same spelling is used in all three lists.
numeric_columns <- c(
  "TotRmsAbvGrd", "GrLivArea", "LotFrontage", "LotArea", "YearBuilt",
  "YearRemodAdd", "BsmtUnfSF", "TotalBsmtSF", "BsmtFinSF1", "BsmtFinSF2",
  "FullBath", "HalfBath", "BedroomAbvGr", "GarageYrBlt", "GarageCars",
  "GarageArea", "WoodDeckSF", "OpenPorchSF", "ScreenPorch", "PoolArea",
  "MiscVal", "MoSold", "YrSold", "MasVnrArea", "X1stFlrSF", "X2ndFlrSF",
  "LowQualFinSF", "EnclosedPorch", "X3SsnPorch", "SalePrice"
)

# First subset of the numeric columns (plus the target).
# Previously listed "BsmtFinSF1" twice and never "BsmtFinSF2".
numeric_columns1 <- c(
  "TotRmsAbvGrd", "GrLivArea", "LotFrontage", "LotArea", "YearBuilt",
  "YearRemodAdd", "BsmtUnfSF", "TotalBsmtSF", "BsmtFinSF1", "BsmtFinSF2",
  "SalePrice"
)

# Second subset of the numeric columns (plus the target).
# Previously used "1stFlrSF", "2ndFlrSF" and "3SsnPorch", which do not match
# the X-prefixed names used by numeric_columns and the feature engineering.
numeric_columns2 <- c(
  "TotalBsmtSF", "X1stFlrSF", "X2ndFlrSF", "FullBath", "HalfBath",
  "BedroomAbvGr", "GarageYrBlt", "GarageCars", "GarageArea", "WoodDeckSF",
  "OpenPorchSF", "X3SsnPorch", "ScreenPorch", "PoolArea", "MiscVal",
  "MoSold", "YrSold", "MasVnrArea", "SalePrice"
)

# Everything that is neither numeric nor the row identifier is categorical
categorical_columns <- setdiff(all_cols, c(numeric_columns, "Id"))
categorical_columns

## Imputing Null values

In [None]:
# Fill the missing values of a data frame, one column at a time.
#
# df          data frame to impute
# categorical names of the columns to treat as categorical
#
# Categorical columns get the literal level "None" in place of NA; every other
# column gets its own median (computed once, ignoring NA). Columns without any
# NA are returned untouched. Returns the imputed data frame.
impute_missing <- function(df, categorical) {
  for (col in names(df)) {
    values <- df[[col]]
    missing <- is.na(values)
    if (!any(missing)) {
      next
    }
    if (col %in% categorical) {
      # Go through character so "None" can be stored whatever the column
      # type is (a factor would otherwise reject the unknown level).
      values <- as.character(values)
      values[missing] <- "None"
    } else {
      values[missing] <- median(values, na.rm = TRUE)
    }
    df[[col]] <- values
  }
  df
}

# Each dataset is imputed from its own values, using the same column grouping
train <- impute_missing(train, categorical_columns)
test <- impute_missing(test, categorical_columns)


## Finding correlation between various numerical variables and the SalePrice

In [None]:
library(corrplot)
library(ggcorrplot)

# Pairwise correlations of the numeric columns, rounded to one decimal for the
# cell labels, together with the matching matrix of p-values.
cor_ind <- round(cor(train[numeric_columns]), 1)
p.mat <- cor_pmat(train[numeric_columns])

# Heatmap in the given column order (no clustering); cells flagged as
# insignificant by p.mat are left blank.
ggcorrplot(cor_ind,
  p.mat = p.mat, hc.order = FALSE,
  insig = "blank", lab = TRUE, lab_size = 2
) +
  theme(
    axis.text.x = element_text(
      size = 7, angle = 45, vjust = 1, hjust = 1,
      margin = margin(-3, 0, 0, 0)
    ),
    axis.text.y = element_text(size = 7, margin = margin(0, -3, 0, 0)),
    panel.grid.major = element_blank(),
    plot.title = element_text(face = "bold", size = 10, hjust = 0)
  ) +
  ggtitle("Correlation between various numeric columns")

## Correlation between various independent variables

In [None]:
 
set.seed(123)
library(dplyr)
library(tidyr)

# Long table with one row per unordered pair of numeric columns, strongest
# correlation first. Keeping only var1 < var2 removes the self-pairs and the
# mirrored duplicate of every pair; de-duplicating on the correlation value
# itself would also drop distinct pairs that happen to share a value.
p <- cor(train[numeric_columns]) %>%
  as.data.frame() %>%
  mutate(var1 = rownames(.)) %>%
  gather(var2, value, -var1) %>%
  filter(var1 < var2) %>%
  arrange(desc(value))

# Pairs with a correlation of at least 0.5
filter(p, value >= 0.5)


## Box plots between categorical variables and SalePrice

In [None]:
# Now let's look at how 'SalePrice' varies across the categorical variables
# Consider 'YearBuilt', 'LandContour', 'BldgType', 'RoofStyle', 'Foundation', 'Heating'
library(GGally)
library(grid)
library(gridExtra)

# One box plot of SalePrice per categorical column, stored under the column name
plots <- list()
for (i in categorical_columns) {
  plots[[i]] <- ggally_box(train, aes_string(i, "SalePrice", color = i))
}

# Draw the plots in pages of 15 instead of indexing fixed positions, so the
# cell keeps working when the number of categorical columns changes.
# Full pages use 3 columns; a shorter final page uses 2.
plots_per_page <- 15
n_pages <- ceiling(length(plots) / plots_per_page)
for (page_no in seq_len(n_pages)) {
  first <- (page_no - 1) * plots_per_page + 1
  last <- min(page_no * plots_per_page, length(plots))
  page <- unname(plots[first:last])
  grid.arrange(
    grobs = page,
    ncol = if (length(page) < plots_per_page) 2 else 3
  )
}

## Plotting the histogram for dependent variable 'SalePrice'

In [None]:

# SalePrice next to its natural log, to compare the two distributions.
# The column names double as plot titles (the log column was previously
# misspelled "LogSalePRice").
data1 <- data.frame(SalePrice = train$SalePrice)
data1[["LogSalePrice"]] <- log(train$SalePrice)

# One histogram per column of data1, stored under the column name
out <- list()
for (col_name in names(data1)) {
  out[[col_name]] <- ggplot(data.frame(x = data1[[col_name]]), aes(x)) +
    geom_histogram() +
    ggtitle(col_name)
}

# Raw and log-transformed histograms side by side
grid.arrange(out[[1]], out[[2]], ncol = 2)

## Quantile-quantile (Q-Q) plots for SalePrice

In [None]:
# Normal Q-Q plot of the raw sale price, with its reference line
qqnorm(train$SalePrice)
qqline(train$SalePrice)

# The same for the log-transformed sale price
qqnorm(log(train$SalePrice))
qqline(log(train$SalePrice))

## Plotting histograms for all the numerical columns

In [None]:
# One histogram per numeric column, stored under the column name
out <- list()
for (col_name in numeric_columns) {
  out[[col_name]] <- ggplot(data.frame(x = train[[col_name]]), aes(x)) +
    geom_histogram() +
    ggtitle(col_name)
}

# Draw the histograms in pages of 15 (5 per row) instead of indexing fixed
# positions, so the cell keeps working if numeric_columns changes length.
plots_per_page <- 15
n_pages <- ceiling(length(out) / plots_per_page)
for (page_no in seq_len(n_pages)) {
  first <- (page_no - 1) * plots_per_page + 1
  last <- min(page_no * plots_per_page, length(out))
  grid.arrange(grobs = unname(out[first:last]), ncol = 5)
}

## Some feature engineering

In [None]:
# Create a combined dataset: training rows (without the target) stacked on top
# of the test rows, so both share the same factor levels and derived features.
ntrain <- nrow(train)
ntest <- nrow(test)
# Drop the target by name rather than by assuming it is the column after 80
all_data <- rbind(train[, setdiff(names(train), "SalePrice")], test)

# Turn every categorical column into a factor, whole column at once.
# Levels come out in as.factor()'s sorted order.
for (i in intersect(names(all_data), categorical_columns)) {
  all_data[[i]] <- as.factor(all_data[[i]])
}

# Adding one new feature: total area of basement, first and second floor
all_data[["TotalSF"]] <- all_data[["TotalBsmtSF"]] +
  all_data[["X1stFlrSF"]] +
  all_data[["X2ndFlrSF"]]

## Splitting the original training dataset into training and testing sets.

In [None]:
# The first ntrain_set labelled rows are used for fitting; the remaining
# labelled rows form the validation set.
ntrain_set <- 1100
stopifnot(ntrain_set < nrow(train))
nvali_set <- nrow(train) - ntrain_set

train_rows <- seq_len(ntrain_set)
val_rows <- ntrain_set + seq_len(nvali_set)
# all_data holds the training rows first and the test rows after them
test_rows <- nrow(train) + seq_len(nrow(all_data) - nrow(train))

# Features come from all_data (factors + TotalSF) with the first column (Id)
# dropped; targets are the log of SalePrice.
train_x <- all_data[train_rows, ][-1]
train_y <- log(train[train_rows, "SalePrice"])
val_x <- all_data[val_rows, ][-1]
val_y <- log(train[val_rows, "SalePrice"])
# Take the test features from all_data as well, so they carry the same
# factor levels and engineered columns as train_x / val_x (previously this
# was a plain copy of the raw test frame).
test_x <- all_data[test_rows, ][-1]

## Ridge regression model

In [None]:
library(glmnet)

# Numeric design matrices (dummy-coded factors) without the intercept column
x_train <- model.matrix(train_y ~ ., cbind(train_x, train_y))[, -1]
x_val <- model.matrix(~., data = val_x)[, -1]

# Cross-validate the ridge penalty (alpha = 0). The folds are drawn at random,
# so fix the seed to make the selected lambda reproducible.
set.seed(123)
cv.out <- cv.glmnet(x_train, train_y, alpha = 0)
bestlam <- cv.out$lambda.min

# Use the full-data fit produced on the very lambda path that was
# cross-validated. Fitting on a separate hand-written lambda grid and then
# predicting at bestlam evaluates the model at a penalty it was never fitted
# or tuned for.
ridge.mod <- cv.out$glmnet.fit
ridge.pred <- predict(ridge.mod, s = bestlam, newx = x_val)

# CV curve, chosen lambda, and validation RMSE on the original price scale
plot(cv.out)
bestlam
sqrt(mean((exp(ridge.pred) - exp(val_y))^2))


## Lasso regression model.

In [None]:
library(glmnet)

# Numeric design matrices (dummy-coded factors) without the intercept column.
# The validation matrix is named x_val because that is the name the
# prediction below reads.
x_train <- model.matrix(train_y ~ ., cbind(train_x, train_y))[, -1]
x_val <- model.matrix(~., data = val_x)[, -1]

# Cross-validate the lasso penalty (alpha = 1). The folds are drawn at random,
# so fix the seed to make the selected lambda reproducible.
set.seed(123)
cv.out <- cv.glmnet(x_train, train_y, alpha = 1)
bestlam <- cv.out$lambda.min

# Use the full-data fit produced on the very lambda path that was
# cross-validated. Fitting on a separate hand-written lambda grid and then
# predicting at bestlam evaluates the model at a penalty it was never fitted
# or tuned for.
lasso.mod <- cv.out$glmnet.fit
lasso.pred <- predict(lasso.mod, s = bestlam, newx = x_val)

# CV curve, chosen lambda, and validation RMSE on the original price scale
plot(cv.out)
bestlam
sqrt(mean((exp(lasso.pred) - exp(val_y))^2))


## Simple linear regression model

In [None]:
library(glmnet)

# Numeric design matrices (dummy-coded factors) without the intercept column
x_train <- model.matrix(train_y ~ ., cbind(train_x, train_y))[, -1]
x_val <- model.matrix(~., data = val_x)[, -1]

# lambda = 0 switches the penalty off, giving a plain least-squares fit.
# There is nothing to tune: a cross-validated lambda cannot change a model
# that was fitted at this single value, so no CV is run here.
linear.mod <- glmnet(x_train, train_y, alpha = 0, lambda = 0, family = "gaussian")
linear.pred <- predict(linear.mod, s = 0, newx = x_val)

# Validation RMSE on the original price scale
sqrt(mean((exp(linear.pred) - exp(val_y))^2))


## Simple linear regression model using basic lm method from R

In [None]:
# Ordinary least squares of the log price on every training feature
model <- lm(train_y ~ ., data = cbind(train_x, train_y))

# Coefficient table and fit statistics
summary(model)

In [None]:
## Residual plot for simple linear regression model

In [None]:
# Plot the residuals of the lm fit stored in `model`
plot(residuals(model),main = 'Residuals')

## XGBoost Model

In [None]:
# library() stops with an error if the package is missing
# (require() would only return FALSE and let the cell fail later).
library(xgboost)
set.seed(100)

# Gradient-boosted trees on the same design matrix as the linear models.
# - `nfold` was dropped: it is an argument of xgb.cv(), not of xgboost().
# - "reg:squarederror" is the current name of the deprecated "reg:linear".
xgb.mod <- xgboost(
  data = x_train, label = train_y,
  nrounds = 2000, verbose = FALSE,
  objective = "reg:squarederror", eval_metric = "rmse",
  nthread = 8, eta = 0.01, gamma = 0.0468, max_depth = 6,
  min_child_weight = 1.7817, subsample = 0.5213, colsample_bytree = 0.4603
)
xgb.importance(colnames(x_train), model = xgb.mod)

## Predictions
# Validation predictions (log scale). These stay in xgb.pred so they can be
# scored against val_y; they are no longer overwritten by test predictions.
xgb.pred <- predict(xgb.mod, newdata = x_val)

# Test predictions (log scale). The test rows sit after the training rows in
# all_data; they are encoded with model.matrix exactly like x_train.
# NOTE(review): model.matrix drops rows that still contain NA — confirm the
# row count matches the test set before building a submission from this.
xgb.test.rows <- nrow(train) + seq_len(nrow(all_data) - nrow(train))
xgb.test.x <- model.matrix(~., data = all_data[xgb.test.rows, ][-1])[, -1]
xgb.test.pred <- predict(xgb.mod, newdata = xgb.test.x)

# Validation RMSE on the original price scale, then on the log scale.
# (There are no labels for the test set, so only validation can be scored.)
sqrt(mean((exp(xgb.pred) - exp(val_y))^2))
sqrt(mean((xgb.pred - val_y)^2))

In [None]:
# Weighted blend of the xgboost and lasso predictions (log scale);
# c1 is the weight given to xgboost.
c1 <- 0.7

# Both inputs must be predictions for the same validation rows
stopifnot(length(xgb.pred) == length(lasso.pred), length(xgb.pred) == length(val_y))

# lasso.pred comes back from predict() as a one-column matrix; flatten it so
# the blend is a plain numeric vector.
final.pred <- c1 * xgb.pred + (1 - c1) * as.numeric(lasso.pred)

# RMSE on the original price scale against the validation targets
# (the previously referenced `test_y` is never defined in this file).
sqrt(mean((exp(final.pred) - exp(val_y))^2))

In [None]:
# Small table with the number of columns in each group
header <- c("Numeric", "Categorical")
x <- c(
  length(numeric_columns),
  length(categorical_columns)
)
columns <- data.frame(header, x)


In [None]:
# NOTE(review): neither data nor a file path is passed here, so this looks like
# an unfinished placeholder for saving results — confirm what should be written.
write.csv(cbind())