<div >
<img src = "../banner.jpg" />
</div>

# Spatial Data

In [None]:
# Attach pacman with library() so that a missing installation stops here with
# a clear error; require() would only warn and return FALSE, and the failure
# would surface later as "could not find function p_load".
library("pacman")
# Load every package used by the spatial-data section through pacman.
p_load("tidyverse", "sf", "modeldata", "geojsonio")

In [None]:
# Load the `ames` dataset from the modeldata package into the workspace.
data("ames", package = "modeldata")

In [None]:
# Preview the first rows of the data.
head(ames)

In [None]:
# Number of rows and columns.
dim(ames)

In [None]:
# Inspect the class of the raw dataset.
class(ames)

![](figs/mercator.gif)

In [None]:
# Turn the plain data frame into an sf point layer.
ames_sf <- ames %>%
  sf::st_as_sf(
    # x/y order: longitude first, then latitude
    coords = c("Longitude", "Latitude"),
    # EPSG:4326 is the standard WGS84 geodetic coordinate reference system
    crs = 4326,
    # keep the Longitude/Latitude columns next to the new geometry column
    remove = FALSE
  )

In [None]:
#?st_as_sf

In [None]:
# Inspect the class of the converted object.
class(ames_sf)

In [None]:
# Preview the first rows of the spatial object.
head(ames_sf)

In [None]:
# Plot the point layer with ggplot2
point_layer <- geom_sf(data = ames_sf)
ggplot() + point_layer + theme_bw()

In [None]:
p_load("leaflet")

In [None]:
# Interactive map: default base tiles plus one circle marker per row of ames_sf.
map1 <- addCircleMarkers(addTiles(leaflet()), data = ames_sf)
map1

In [None]:
# Jupyter-only workaround (not needed in RStudio): write the widget to an
# HTML file, then embed that file in the notebook through an iframe.
p_load("htmlwidgets", "IRdisplay")

saveWidget(widget = map1, file = 'demo1.html', selfcontained = FALSE)
display_html('<iframe src="demo1.html" width="800" height="800"></iframe>')

In [None]:
# Same points on a different basemap, drawn with addCircles().
# NOTE(review): newer leaflet / leaflet.providers releases may no longer expose
# `Stamen.Toner` -- confirm the provider exists in the installed version.
toner_tiles <- providers$Stamen.Toner
map2 <- leaflet() %>%
  addProviderTiles(toner_tiles) %>%
  addCircles(data = ames_sf)

In [None]:
# Jupyter-only workaround (not needed in RStudio): write the widget to an
# HTML file, then embed that file in the notebook through an iframe.
p_load("htmlwidgets", "IRdisplay")

saveWidget(widget = map2, file = 'demo2.html', selfcontained = FALSE)
display_html('<iframe src="demo2.html" width="800" height="800"></iframe>')

## Spatial Autocorrelation

The relationship between sale price and house characteristics may exhibit spatial autocorrelation across the city of Ames, and we can use any of the several different methods provided by spatialsample to try and investigate it. 


### Spatial Buffers

The `spatial_buffer_vfold_cv()` function will perform [spatially buffered cross-validation](https://onlinelibrary.wiley.com/doi/10.1111/geb.12161) with your data:


In [None]:
p_load("spatialsample")

# Fix the RNG state so the fold assignment is reproducible.
set.seed(123)
buffer_folds <- ames_sf %>%
  spatial_buffer_vfold_cv(radius = 40, buffer = 5)

autoplot(buffer_folds)

### Spatial Blocks

For instance, the `spatial_block_cv()` function will perform [spatial blocking](https://doi.org/10.1111/ecog.02881) with your data:

In [None]:
# Fix the RNG state, then build 5 folds from spatial blocks.
set.seed(123)
block_folds <- ames_sf %>%
  spatial_block_cv(v = 5)

autoplot(block_folds)

### Spatial LLOCV

If you already have a sense of what locations in your data are likely to be closely related, you can also use the `spatial_leave_location_out_cv()` function to perform [leave-location-out cross-validation](https://doi.org/10.1016/j.envsoft.2017.12.001). 

For instance, we can split the Ames data into folds based on neighborhoods using this function:

In [None]:
set.seed(123)

# Leave-location-out folds, with locations defined by the Neighborhood column.
location_folds <- ames_sf %>%
  spatial_leave_location_out_cv(group = Neighborhood)

In [None]:
# Plot the fold assignment of the leave-location-out resamples.
autoplot(location_folds)

In [None]:
p_load("purrr")

# Draw every split on its own figure.
for (one_split in location_folds$splits) {
  print(autoplot(one_split))
}

## Full implementation with Elastic Net

\begin{align}
\min_{\beta} EN(\beta) &= \sum_{i=1}^n \Big(y_i-\beta_0 - \sum_{j=1}^p x_{ij}\beta_j\Big)^2  + \lambda\left(\alpha \sum_{j=1}^p |\beta_j| + \frac{(1-\alpha)}{2} \sum_{j=1}^p (\beta_j)^2\right)
\end{align}

In [None]:
# Collect the training-row indices (`in_id`) of every split so they can be
# passed to caret. Iterating over the splits themselves removes the hard-coded
# fold count (previously 1:10), so the list always matches location_folds.
folds <- lapply(location_folds$splits, function(split) split$in_id)


In [None]:
# First few training-row indices of the first fold.
head(folds[[1]])

In [None]:
# Training rows of fold 2 that do not appear among fold 1's training rows.
second_fold <- folds[[2]]
second_fold[is.na(match(second_fold, folds[[1]]))]

In [None]:
p_load("caret")

# Cross-validation driven by the spatial folds: `index` supplies the
# training rows of each resample.
fitControl <- trainControl(
  index = folds,
  method = "cv"
)



In [None]:
# Tuning grid: 20 alpha values crossed with 50 lambda values.
en_grid <- expand.grid(
  alpha = seq(0, 1, length.out = 20),
  lambda = seq(0.001, 0.2, length.out = 50)
)

# Elastic net for log sale price, resampled with the spatial folds.
EN <- train(
  log(Sale_Price) ~ Gr_Liv_Area + Bldg_Type,
  data = ames_sf,
  method = 'glmnet',
  trControl = fitControl,
  tuneGrid = en_grid
)

In [None]:
# Print the fitted caret object (resampling results per tuning combination).
EN

In [None]:
# Tuning-parameter combination selected by caret.
EN$bestTune

In [None]:
# Cross-validated RMSE of the best tuning combination. The row is located by
# the smallest RMSE; indexing with which.min(lambda) merely returned the first
# row holding the smallest lambda, whatever its performance.
round(EN$results$RMSE[which.min(EN$results$RMSE)], 4)

In [None]:
set.seed(123)

# Plain 5-fold cross-validation (no spatial structure) for comparison.
fitControl2 <- trainControl(
  number = 5,
  method = "cv"
)

# Same 20 x 50 tuning grid as the spatially resampled model.
en2_grid <- expand.grid(
  alpha = seq(0, 1, length.out = 20),
  lambda = seq(0.001, 0.2, length.out = 50)
)

EN2 <- train(
  log(Sale_Price) ~ Gr_Liv_Area + Bldg_Type,
  data = ames_sf,
  method = 'glmnet',
  trControl = fitControl2,
  tuneGrid = en2_grid
)



In [None]:
# Cross-validated RMSE of the best tuning combination. The row is located by
# the smallest RMSE; indexing with which.min(lambda) merely returned the first
# row holding the smallest lambda, whatever its performance.
round(EN2$results$RMSE[which.min(EN2$results$RMSE)], 4)

## Example Problem Set

In [None]:
# Hold out the North_Ames neighborhood as the test set ...
test <- filter(ames_sf, Neighborhood == "North_Ames")

# ... and train on every other neighborhood.
train <- filter(ames_sf, Neighborhood != "North_Ames")

In [None]:
set.seed(123)

# Leave-location-out folds built on the training rows only.
location_folds_train <- train %>%
  spatial_leave_location_out_cv(group = Neighborhood)



In [None]:
# Plot the fold assignment of the training-set resamples.
autoplot(location_folds_train)

In [None]:
# Training-row indices (`in_id`) of every split, in the list form caret's
# `index` argument takes. lapply() replaces the 1:length() loop, which would
# iterate over c(1, 0) if the split list were ever empty.
folds_train <- lapply(
  location_folds_train$splits,
  function(split) split$in_id
)


In [None]:
# Resampling scheme 1: ordinary 5-fold cross-validation.
fitControl_tp_random <- trainControl(
  number = 5,
  method = "cv"
)

# Resampling scheme 2: cross-validation on the spatial training folds.
fitControl_spatial <- trainControl(
  index = folds_train,
  method = "cv"
)

In [None]:
set.seed(123)

# 10 x 10 grid over the elastic net mixing (alpha) and penalty (lambda).
grid_random <- expand.grid(
  alpha = seq(0, 1, length.out = 10),
  lambda = seq(0.001, 0.2, length.out = 10)
)

# Elastic net tuned on MAE with ordinary 5-fold cross-validation.
EN_tp_random <- train(
  log(Sale_Price) ~ Gr_Liv_Area:Bldg_Type,
  data = train,
  method = 'glmnet',
  metric = "MAE",
  trControl = fitControl_tp_random,
  tuneGrid = grid_random
)

In [None]:
set.seed(123)

# 10 x 10 grid over the elastic net mixing (alpha) and penalty (lambda).
grid_spatial <- expand.grid(
  alpha = seq(0, 1, length.out = 10),
  lambda = seq(0.001, 0.2, length.out = 10)
)

# Elastic net tuned on MAE with the spatial training folds.
EN_tp_spatial <- train(
  log(Sale_Price) ~ Gr_Liv_Area:Bldg_Type,
  data = train,
  method = 'glmnet',
  metric = "MAE",
  trControl = fitControl_spatial,
  tuneGrid = grid_spatial
)

In [None]:
#EN_tp_random

In [None]:
#EN_tp$bestTune

In [None]:
# Predicted log price on the test set from the randomly resampled model.
test$log_price_hat_random<-predict(EN_tp_random,newdata = test)

In [None]:
# Compare observed prices with the predicted log prices, without the geometry.
test %>%
  st_drop_geometry() %>%
  select(Sale_Price, log_price_hat_random) %>%
  head()

In [None]:
# Predicted log price on the test set from the spatially resampled model.
test$log_price_hat_spatial<-predict(EN_tp_spatial,newdata = test)

In [None]:
# Back-transform both sets of log-price predictions to the price scale.
test <- mutate(
  test,
  price_hat_random = exp(log_price_hat_random),
  price_hat_spatial = exp(log_price_hat_spatial)
)

#### What is Kaggle's score?

In [None]:
# Mean absolute error (price scale) of the randomly resampled model.
mean(abs(test$Sale_Price-test$price_hat_random))

In [None]:
# Mean absolute error (price scale) of the spatially resampled model.
mean(abs(test$Sale_Price-test$price_hat_spatial))

In [None]:
# Mean absolute error again, with the predictions rounded to whole numbers.
mean(abs(test$Sale_Price-round(test$price_hat_random)))

# Super learner

## Ames data

Vamos a modelar los precios de venta de las casas en el conjunto de datos de Ames. Digamos que el precio de venta de estas casas depende del año en que se construyeron, su superficie habitable (tamaño) y el tipo de casa que son (dúplex vs. townhouse vs. unifamiliar)

In [None]:
# Add the log of the sale price as the modelling target on both sets.
train <- mutate(train, logprice = log(Sale_Price))
test <- mutate(test, logprice = log(Sale_Price))

In [None]:
# p_load("caret")
# set.seed(1011)
# inTrain <- createDataPartition(
#   y = ames$logprice,## La variable dependiente u objetivo 
#   p = .7, ## Usamos 70%  de los datos en el conjunto de entrenamiento 
#   list = FALSE)


# train <- ames[ inTrain,]
# test  <- ames[-inTrain,]
# colnames(train)

In [None]:
p_load("SuperLearner")

In [None]:
# List the learner and screening wrappers shipped with SuperLearner.
listWrappers()

In [None]:
# Predictor table (geometry removed) and outcome vector for SuperLearner.
XSL <- train %>%
  st_drop_geometry() %>%
  select(Year_Built, Bldg_Type, Gr_Liv_Area)
ySL <- train$logprice

In [None]:
# Preview the predictor table.
head(XSL)

In [None]:
# Candidate algorithms to run inside the ensemble.
sl.lib <- c("SL.randomForest", "SL.lm")

# Fit the ensemble with the SuperLearner package.
fitY <- SuperLearner(
  Y = ySL,
  X = data.frame(XSL),
  SL.library = sl.lib,
  method = "method.NNLS" # convex combination of the learners
)

fitY

In [None]:
# Predict with the ensemble on the same three predictors it was fitted on,
# rather than handing predict() the whole sf object (geometry column and all).
test_XSL <- test %>%
  st_drop_geometry() %>%
  select(Year_Built, Bldg_Type, Gr_Liv_Area) %>%
  data.frame()

# TRUE is spelled out because `T` is an ordinary, reassignable variable.
# as.numeric() makes sure the new column is a plain numeric vector.
test <- test %>%
  mutate(
    yhat_Sup = as.numeric(
      predict(fitY, newdata = test_XSL, onlySL = TRUE)$pred
    )
  )
head(test$yhat_Sup)

In [None]:
# Back-transform the ensemble's log-price predictions to the price scale.
test<- test  %>% mutate(price_hat_Sup=exp(yhat_Sup))

In [None]:
# Mean absolute error of the ensemble, with predictions rounded to whole numbers.
mean(abs(test$Sale_Price-round(test$price_hat_Sup)))

## Test algorithm with multiple hyperparameter settings

The performance of an algorithm varies based on its hyperparameters, which again are its configuration settings. Some algorithms may not vary much, and others might have far better or worse performance for certain settings. Often we focus our attention on 1 or 2 hyperparameters for a given algorithm because they are the most important ones.

For random forest there are two particularly important hyperparameters: mtry and maximum leaf nodes. Mtry is how many features are randomly chosen within each decision tree node - in other words, each time the tree considers making a split. Maximum leaf nodes controls how complex each tree can get.

Let's try 3 different mtry options.

In [None]:
# Build an SL.ranger-based learner that uses 1000 trees.
ranger_params <- list(num.trees = 1000)
custon_ranger <- create.Learner("SL.ranger", params = ranger_params)

# Names of the generated learner(s).
custon_ranger$names


In [None]:
# One SL.randomForest variant per mtry value (1, 2 and 3).
mtry_values <- round(c(1, sqrt(4), 3))
custom_rf <- create.Learner("SL.randomForest", tune = list(mtry = mtry_values))
custom_rf$names

In [None]:
# One SL.glmnet variant per alpha value on an even 5-point grid from 0 to 1.
alpha_values <- seq(0, 1, length.out = 5)
custon_glmnet <- create.Learner("SL.glmnet", tune = list(alpha = alpha_values))

# Names of the generated learner(s).
custon_glmnet$names

In [None]:
# Extended library: the two default learners plus every customized variant.
sl.lib2 <- c(
  "SL.randomForest",
  "SL.lm",
  custon_ranger$names,
  custon_glmnet$names,
  custom_rf$names
)
sl.lib2

In [None]:
# Fit the ensemble on the extended library (slow: many learners are trained).
fitY_long <- SuperLearner(
  Y = ySL,
  X = data.frame(XSL),
  SL.library = sl.lib2,
  method = "method.NNLS"
)

fitY_long

# Spatial Cross Validation


In [None]:
p_load("spatialsample")

# Rebuild the sf point layer from the raw data. `remove` is left at its
# default here, unlike the earlier conversion that passed remove = FALSE.
ames_sf <- ames %>%
  sf::st_as_sf(
    # x/y order: longitude first, then latitude
    coords = c("Longitude", "Latitude"),
    # EPSG:4326 is the standard WGS84 geodetic coordinate reference system
    crs = 4326
  )


In [None]:
# Fix the RNG state, then build 15 folds from spatial blocks.
set.seed(123)
block_folds <- ames_sf %>%
  spatial_block_cv(v = 15)

In [None]:
# Plot the block-based fold assignment with a black-and-white theme.
autoplot(block_folds) + theme_bw()

In [None]:

# Fix the RNG state, then build 15 folds by spatial clustering.
set.seed(123)
cluster_folds <- ames_sf %>%
  spatial_clustering_cv(v = 15)
autoplot(cluster_folds) + theme_bw()

In [None]:
# Fix the RNG state, then build 15 leave-location-out folds, with locations
# defined by the Neighborhood column.
set.seed(123)
location_folds <- ames_sf %>%
  spatial_leave_location_out_cv(
    group = Neighborhood,
    v = 15
  )

In [None]:
# Plot the leave-location-out fold assignment with a black-and-white theme.
autoplot(location_folds)+ theme_bw()

In [None]:
# Row counts per Neighborhood level.
table(ames_sf$Neighborhood)


In [None]:
# Remove Neighborhood factor levels that have no rows.
ames_sf <- mutate(ames_sf, Neighborhood = droplevels(Neighborhood))

In [None]:
# Row counts per Neighborhood level.
table(ames_sf$Neighborhood)

In [None]:
# Number of distinct neighborhoods present in the data.
length(unique(ames_sf$Neighborhood))

In [None]:
# Hold out North_Ames as the test set; drop the factor levels that the
# filtering leaves without rows.
test_neigh <- ames_sf %>%
  filter(Neighborhood == "North_Ames") %>%
  mutate(Neighborhood = droplevels(Neighborhood))

# Train on every other neighborhood, again dropping unused levels.
train_neigh <- ames_sf %>%
  filter(Neighborhood != "North_Ames") %>%
  mutate(Neighborhood = droplevels(Neighborhood))

In [None]:
# Outcome: log sale price. train_neigh derives from the ames_sf rebuilt from
# the raw `ames` data, which carries no `logprice` column (that column was only
# added to `train`/`test`), so the log is computed here from Sale_Price.
y_neigh <- log(train_neigh$Sale_Price)

# Predictor table with the geometry removed.
X_neigh <- train_neigh %>%
  select(Year_Built, Bldg_Type, Gr_Liv_Area) %>%
  st_drop_geometry()

In [None]:
# Row positions of train_neigh grouped by neighborhood (one element per level).
# seq_len() is used instead of 1:nrow(), which yields c(1, 0) for an empty table.
index <- split(seq_len(nrow(train_neigh)), train_neigh$Neighborhood)

In [None]:
# Show the row positions assigned to each neighborhood.
index

In [None]:
# Number of validation folds: one per element of `index`.
# Note: this reuses (overwrites) the name `folds`.
folds<-length(index)
folds

In [None]:
# Cross-validation settings: `folds` folds whose validation rows are the
# per-neighborhood row positions in `index`.
neigh_cv <- list(V = folds, validRows = index)

# Fit the ensemble with neighborhood-based validation folds.
fitY_neigh <- SuperLearner(
  Y = y_neigh,
  X = data.frame(X_neigh),
  SL.library = sl.lib,
  method = "method.NNLS",
  cvControl = neigh_cv
)

In [None]:
# Print the fitted ensemble.
fitY_neigh

In [None]:
# Predict on the held-out neighborhood using the same three predictors the
# ensemble was fitted on, rather than the whole sf object (geometry and all).
test_X_neigh <- test_neigh %>%
  st_drop_geometry() %>%
  select(Year_Built, Bldg_Type, Gr_Liv_Area) %>%
  data.frame()

# TRUE is spelled out because `T` is an ordinary, reassignable variable.
yhat_SL_neigh <- predict(
  fitY_neigh,
  newdata = test_X_neigh,
  onlySL = TRUE
)$pred


In [None]:
# Back-transform the log-price predictions and store them on `test`.
# NOTE(review): yhat_SL_neigh was predicted from test_neigh, not test -- this
# assumes both hold the same rows in the same order; confirm.
test<- test  %>% mutate(price_hat_SL_neigh=exp(yhat_SL_neigh))

In [None]:
# Mean absolute error of the neighborhood-validated ensemble (rounded predictions).
mean(abs(test$Sale_Price-round(test$price_hat_SL_neigh)))