<div >
<img src = "../banner.jpg" />
</div>

<a target="_blank" href="https://colab.research.google.com/github/ignaciomsarmiento/BDML_SS/blob/main/Lecture09/Notebook_SS09_Arboles.ipynb">
  <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>


# CARTs, Bagging and Random Forests

## Predicting House Prices

Our objective today is to construct a model to predict house prices. From Rosen's landmark paper "Hedonic Prices and Implicit Markets: Product Differentiation in Pure Competition" (1974), we know that a differentiated good can be described by a vector of its characteristics.

In the case of a house, these characteristics may include structural attributes (e.g., number of bedrooms), neighborhood public services (e.g., local school quality), and local amenities (e.g., crime, air quality, etc). Thus, we can write the market price of the house as:

$$
Price=f(structural\,attributes,amenities,...)
$$


However, Rosen's theory doesn't tell us much about the functional form of $f$. 

## CARTS

Let's load the packages:

In [None]:
# install.packages("pacman") #run this line if you use Google Colab

In [None]:
# Packages
# library() stops with a clear error when pacman is not installed, whereas
# require() only warns and returns FALSE, postponing the failure to p_load().
library("pacman")
# p_load() (from pacman) makes the listed packages available to the session.
p_load("tidyverse", "ggplot2")

 And the toy data set:

In [None]:
# Read the toy housing data set directly from its GitHub URL
db <- read.csv(
  file = "https://raw.githubusercontent.com/ignaciomsarmiento/datasets/main/toy_houses.csv"
)

In [None]:
# Preview the first six rows of the toy data
head(db, n = 6L)

In [None]:
# Scatter plot of number of rooms against distance to the centre,
# with jitter (width = 0.05) so overlapping points can be told apart
ggplot(data = db, mapping = aes(x = habitaciones, y = DCBD)) +
  geom_point(position = position_jitter(width = 0.05)) +
  scale_x_continuous(breaks = seq(from = 0, to = 8, by = 1)) +
  labs(x = "Habitaciones", y = "Distancia al Centro") +
  theme_classic() +
  theme(
    text = element_text(size = 20),
    legend.position = "none"
  )

### Algorithm


-  Datos: $y_{n\times 1}$  y $X_{n\times k}$ 

-  Definiciones


    -  $j$ es la variable que parte el espacio y  $s$ es el punto de partición

    -  Defina los siguientes semiplanos

\begin{align}
R_1(j,s)=\{X|X_j\leq s\} \,\,\, \& \,\,\, R_2(j,s)=\{X|X_j > s\}
\end{align}

-  *El problema*: buscar la variable de partición $X_j$ y el punto $s$ de forma tal que 


\begin{align}
\min_{j,s} \left[ \min_{y_{R_1}}\sum_{x_i\in R_1(j,s)}(y_i-y_{R_1})^2+ \min_{y_{R_2}}\sum_{x_i\in R_2(j,s)}(y_i-y_{R_2})^2\right]
\end{align}



#### Algorithm by hand ("artesanal")

1. Iniciemos por DCBD

In [None]:
# Grid search over split points on DCBD. For every candidate threshold `s`
# the sample is split into R1 = {DCBD <= s} and R2 = {DCBD > s}, each region
# is predicted by its own mean price, and the two regional MSEs are added.
# NOTE(review): the objective written in the Algorithm section sums squared
# errors within each region, whereas this adds the two per-region means;
# confirm which criterion is intended.

# MSE of `price` around its own mean. An empty region yields NaN (and a
# region with missing prices yields NA); both are scored as 0.
region_mse <- function(price) {
  mse <- mean((price - mean(price))^2)
  if (is.na(mse)) 0 else mse
}

# Candidate split points: 1, 1.25, ..., 2
dcbd_splits <- seq(1, 2, by = 0.25)

MSE_dbcd <- vapply(
  dcbd_splits,
  function(s) {
    # Region 1: at or below the threshold
    left <- db %>% filter(DCBD <= s) %>% pull(price)
    # Region 2: above the threshold
    right <- db %>% filter(DCBD > s) %>% pull(price)
    region_mse(left) + region_mse(right)
  },
  numeric(1)
)

MSE_dbcd

2. Luego por Habitaciones

In [None]:
# Grid search over split points on the number of rooms. For every candidate
# threshold `s` the sample is split into R1 = {habitaciones <= s} and
# R2 = {habitaciones > s}, each region is predicted by its own mean price,
# and the two regional MSEs are added.

# MSE of `price` around its own mean. An empty region yields NaN (and a
# region with missing prices yields NA); both are scored as 0.
region_mse <- function(price) {
  mse <- mean((price - mean(price))^2)
  if (is.na(mse)) 0 else mse
}

# Candidate split points: 0, 1, ..., 8 rooms
hab_splits <- 0:8

MSE_hab <- vapply(
  hab_splits,
  function(s) {
    # Region 1: at or below the threshold
    left <- db %>% filter(habitaciones <= s) %>% pull(price)
    # Region 2: above the threshold
    right <- db %>% filter(habitaciones > s) %>% pull(price)
    region_mse(left) + region_mse(right)
  },
  numeric(1)
)
MSE_hab

Mínimo?


In [None]:
# Pool the criterion values of all candidate splits (DCBD first, then rooms)
MSE <- c(MSE_dbcd, MSE_hab)
# Smallest criterion value among all candidates
best <- which.min(MSE)
MSE[best]
# Full vector, for reference
MSE

<iframe src="m.html"></iframe>

#### Algorithm in R

There are multiple packages that implement trees; we are going to use `rpart`.

In [None]:
# Make rpart available through pacman
pacman::p_load("rpart")

In [None]:
# Regression tree for log price on distance to the centre and number of rooms
mytree <- rpart(log(price) ~ DCBD + habitaciones, data = db)

In [None]:
# Display the fitted tree object
mytree

In [None]:
# Draw the tree skeleton with base graphics ...
plot(mytree)
# ... and then add the text labels on top of that plot
text(mytree)

In [None]:
p_load("rpart.plot")

# Node label: the fitted node value, exponentiated (the tree is fitted on
# log(price)), rounded and printed with thousands separators.
# It reads the values from `x`, the tree passed in by prp(), instead of from
# the global `mytree`, so the labels always belong to the tree being drawn.
price_label <- function(x, labs, digits, varlen) {
  paste(
    "Precio \n",
    format(round(exp(x$frame$yval), 0), nsmall = 0, big.mark = ",")
  )
}

prp(
  mytree,
  under = TRUE,
  branch.lty = 2,
  yesno = 2,
  faclen = 0,
  varlen = 15,
  tweak = 1.2,
  clip.facs = TRUE,
  box.palette = "Greens",
  compress = TRUE,
  ycompress = TRUE,
  node.fun = price_label
)

In [None]:
# Same specification as before, now fitted with cp = -1
mytree_full <- rpart(log(price) ~ DCBD + habitaciones, data = db, cp = -1)

In [None]:
# Plot the cp = -1 tree with the same layout options used for the first tree
prp(
  mytree_full,
  under = TRUE,
  branch.lty = 2,
  yesno = 2,
  faclen = 0,
  varlen = 15,
  tweak = 1.2,
  clip.facs = TRUE,
  box.palette = "Greens",
  compress = TRUE,
  ycompress = TRUE
)

##### With Ames Data Set

In [None]:
p_load("modeldata")

# Load the ames data set shipped with modeldata
data("ames", package = "modeldata")

# Keep only the observations from the ten neighbourhoods listed below
ames <- ames %>%
  filter(
    Neighborhood %in% c(
      "North_Ames", "College_Creek", "Old_Town", "Edwards", "Somerset",
      "Northridge_Heights", "Gilbert", "Sawyer", "Northwest_Ames",
      "Sawyer_West"
    )
  )


In [None]:
# Preview the first six rows of the filtered data
head(ames, n = 6L)

The description of the variables can be viewed here: https://jse.amstat.org/v19n3/decock/DataDocumentation.txt

In [None]:
# Check how the Fence column is stored before using it as a predictor
class(ames[["Fence"]])

In [None]:
# Tree for log sale price on living area, building type and fence, cp = 0.02
amestree <- rpart(
  log(Sale_Price) ~ Gr_Liv_Area + Bldg_Type + Fence,
  data = ames,
  cp = 0.02
)

In [None]:
# Draw the tree skeleton with base graphics ...
plot(amestree)
# ... and then add the split/leaf labels on top of that plot
text(amestree,pretty=TRUE)

### Cost Complexity Pruning

In [None]:
 # Complexity-parameter table stored in the tree fitted with cp = 0.02
 amestree$cptable

In [None]:
# Refit (overwriting amestree) without Fence and without an explicit cp
amestree <- rpart(
  log(Sale_Price) ~ Gr_Liv_Area + Bldg_Type,
  data = ames
)

In [None]:
 # Complexity-parameter table of the refitted tree
 amestree$cptable

### With Caret

In [None]:
# Make caret available through pacman
pacman::p_load("caret")

In [None]:
# Resampling scheme shared by the caret models below: 5-fold cross-validation
fitControl <- trainControl(method = "cv", number = 5)


In [None]:
# Fix the seed so the cross-validation folds are reproducible
set.seed(123)

# rpart tree trained through caret with the resampling scheme in fitControl
tree <- train(
  log(Sale_Price) ~ Gr_Liv_Area + Bldg_Type + Fence,
  data = ames,
  method = "rpart",
  trControl = fitControl
)

In [None]:
# Display the caret training summary
tree

In [None]:
# Plot the final model stored inside the caret object
prp(tree[["finalModel"]])

#### Tuning Hyperparameters

#####  `method=rpart`  only allows tuning the complexity parameter

- Can change the length


In [None]:
# Fix the seed so the cross-validation folds are reproducible
set.seed(123)

# Same specification, passing tuneLength = 100 to train()
tree_lenght <- train(
  log(Sale_Price) ~ Gr_Liv_Area + Bldg_Type + Fence,
  data = ames,
  method = "rpart",
  trControl = fitControl,
  tuneLength = 100
)

In [None]:
# Display the caret training summary for the tuneLength run
tree_lenght

- Or the grid

In [None]:
# Fix the seed so the cross-validation folds are reproducible
set.seed(123)

# Explicit grid of cp values from 0.002584 to 0.002586 in steps of 1e-8
cp_grid <- expand.grid(cp = seq(0.002584, 0.002586, by = 1e-8))

tree_grid <- train(
  log(Sale_Price) ~ Gr_Liv_Area + Bldg_Type + Fence,
  data = ames,
  method = "rpart",
  trControl = fitControl,
  tuneGrid = cp_grid
)


In [None]:
# Display the caret training summary for the explicit cp grid
tree_grid

#####  `method=rpart2`  allows tuning the maximum tree depth

In [None]:
# Fix the seed so the cross-validation folds are reproducible
set.seed(123)

# Candidate maximum depths: 1, 2, ..., 30
depth_grid <- expand.grid(maxdepth = seq(1, 30, by = 1))

tree_rpart2 <- train(
  log(Sale_Price) ~ Gr_Liv_Area + Bldg_Type + Fence,
  data = ames,
  method = "rpart2",
  trControl = fitControl,
  tuneGrid = depth_grid
)

In [None]:
# Display the caret training summary for the maxdepth grid
tree_rpart2

More details here: https://topepo.github.io/caret/train-models-by-tag.html#tree-based-model

## Example Problem Set

Data on Kaggle: https://www.kaggle.com/competitions/uniandes-bdml-202313-ps2/overview

In [None]:
# Read the two problem-set files from the local extra/ folder.
# The data frame `train` shares its name with caret's train(); later calls
# such as train(...) still resolve to the function, because R looks for a
# function when a name is used in call position.
test <- read.csv(file = "extra/test.csv")
train <- read.csv(file = "extra/train.csv")

# Preview the first rows of the test file
head(test)

In [None]:
# Tag every row with the file it came from
test <- mutate(test, sample = "test")
train <- mutate(train, sample = "train")

# Stack both samples (test rows first) and count the rows in each
db_ps <- rbind(test, train)
table(db_ps$sample)

In [None]:
p_load("sf")

# Turn the stacked data into an sf object.
# `coords` is given in x/y order, so longitude comes first; the coordinate
# reference system is EPSG:4326, the standard WGS84 geodetic CRS.
db_ps <- st_as_sf(db_ps, coords = c("lon", "lat"), crs = 4326)

In [None]:
p_load("leaflet")

# Colour palette keyed on the `sample` column of db_ps
pal <- colorFactor(
  palette = c("red", "green"),
  domain = db_ps$sample
)

# Interactive map: base tiles plus one circle per property, coloured by sample.
# The argument is spelled `color` in full; the previous `col` only worked
# through R's partial argument matching.
map <- leaflet() %>%
  addTiles() %>% # base layer
  addCircles(data = db_ps, color = ~pal(sample)) # houses layer
map

In [None]:
#Extra steps bc of jupyter notebook (if you are in RStudio, you don't need these steps)
p_load("htmlwidgets") 
p_load("IRdisplay")
saveWidget(map, file="m.html")
display_html('<iframe width="900" height="700" src="m.html"></iframe>')

### Distance to CBD?


<div >
<img width="400" height="100" src = "figures/precio_suelo.png" />
</div>

In [None]:
# tmaptools needs geojsonio; install it with p_load("geojsonio") if missing
p_load("tmaptools")

# Geocode the CBD through OpenStreetMap and return it as an sf object.
# TRUE is spelled out because `T` is an ordinary variable that can be reassigned.
cbd <- geocode_OSM("Centro Internacional, Bogotá", as.sf = TRUE)
cbd

In [None]:
# Distance from every property in db_ps to the geocoded CBD, stored as a new column.
# NOTE(review): st_distance() presumably returns a distance matrix with units
# rather than a plain numeric vector; confirm the column type before modelling.
db_ps$DCBD<-st_distance(x = db_ps, y = cbd)

In [None]:
# Inspect the first six computed distances
head(db_ps$DCBD, n = 6L)

In [None]:
# Average distance to the CBD in each sample (geometry dropped first)
db_ps %>%
  st_drop_geometry() %>%
  group_by(sample) %>%
  summarize(mean(DCBD))

In [None]:
# Training rows only, keeping price and distance, without incomplete rows
train_data <- db_ps %>%
  filter(sample == "train") %>%
  select(price, DCBD) %>%
  na.omit()

In [None]:
# Fix the seed so the cross-validation folds are reproducible
set.seed(123)

# Tree for log price on distance to the CBD (overwrites the earlier `tree`)
tree <- train(
  log(price) ~ DCBD,
  data = train_data,
  method = "rpart",
  trControl = fitControl,
  tuneLength = 10
)

In [None]:
# Test rows only
test_data <- db_ps %>%
  filter(sample == "test")

# Predictions from the tree; the model was fitted on log(price)
test_data$pred_tree <- predict(tree, newdata = test_data)

In [None]:
# Preview the ids next to their predictions
test_data %>%
  select(property_id, pred_tree) %>%
  head()

In [None]:
# Drop the geometry and undo the log transformation of the predictions
test_data <- test_data %>%
  st_drop_geometry() %>%
  mutate(pred_tree = exp(pred_tree))

# Preview the ids next to the back-transformed predictions
test_data %>%
  select(property_id, pred_tree) %>%
  head()

In [None]:
# Submission file: property id plus the prediction, renamed to `price`
submit <- test_data %>%
  select(property_id, price = pred_tree)

write.csv(submit, file = "Tree_v1.csv", row.names = FALSE)