In [2]:
install.packages("dplyr")

Installing package into ‘/usr/local/lib/R/site-library’
(as ‘lib’ is unspecified)



In [3]:
############################################################
# SETUP (ejecuta primero)
############################################################
# Si no los tienes en Colab, descomenta:
# install.packages("readr")
# install.packages("dplyr")

library(readr)
library(dplyr)

set.seed(123)
options(scipen = 999)  # menos notación científica

# Carga de datos (ajusta el nombre si es distinto)
df <- read_csv("apartments.csv")

# (Opcional) vistazo rápido
# head(df, 5)
# colSums(is.na(df))


[1mRows: [22m[34m110191[39m [1mColumns: [22m[34m21[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m  (9): id, type, ownership, buildingmaterial, hasparkingspace, hasbalcony...
[32mdbl[39m (12): price, month, area, rooms, centredistance, schooldistance, clinicd...

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.


In [4]:
############################################################
# 3a CLEANING
############################################################

# 1) area2 = area^2
df <- df %>% mutate(area2 = area^2)

# 2) Convertir 'yes'/'no' -> 1/0 en las columnas indicadas
bin_cols <- c("hasparkingspace","hasbalcony","haselevator","hassecurity","hasstorageroom")
bin_cols <- intersect(bin_cols, names(df))

# Aseguramos minúsculas y mapeamos; cualquier otro valor/NA -> 0
df <- df %>%
  mutate(across(all_of(bin_cols), ~ ifelse(tolower(as.character(.)) == "yes", 1L,
                                           ifelse(tolower(as.character(.)) == "no", 0L, 0L))))

# 3) Dummies para el último dígito de 'area': end_0 ... end_9
#    - Creamos 'last_digit'
df <- df %>% mutate(last_digit = as.integer(area %% 10))

#    - end_0 ... end_9
for (i in 0:9) {
  df[[paste0("end_", i)]] <- as.integer(df$last_digit == i)
}

# 4) Asegurar tipos numéricos donde corresponde (por si vinieron como texto)
num_cols <- c(
  "price", "area", "area2",
  "schooldistance","clinicdistance","postofficedistance",
  "kindergartendistance","restaurantdistance","collegedistance","pharmacydistance",
  bin_cols
)
num_cols <- intersect(num_cols, names(df))
df <- df %>% mutate(across(all_of(num_cols), ~ suppressWarnings(as.numeric(.))))

# 5) Categóricas a factor (R se encarga de las dummies en la regresión)
cat_cols <- c("month","type","rooms","ownership","buildingmaterial")
cat_cols <- intersect(cat_cols, names(df))
df <- df %>% mutate(across(all_of(cat_cols), ~ as.factor(.)))

# 6) Eliminar filas con NA en variables que usaremos (para evitar errores en lm)
vars_for_model <- c(
  "price", "area","area2",
  "schooldistance","clinicdistance","postofficedistance",
  "kindergartendistance","restaurantdistance","collegedistance","pharmacydistance",
  bin_cols,
  paste0("end_", 0:8),   # Ojo: omitiremos end_9 en la regresión (base)
  cat_cols,
  "last_digit"
)
vars_for_model <- intersect(vars_for_model, names(df))
df_clean <- df %>% select(all_of(vars_for_model)) %>% na.omit()

# (Opcional) Verificación rápida
# str(df_clean)
# colSums(is.na(df_clean))

In [7]:
############################################################
# 3b-i LINEAR MODEL (OLS)
############################################################

# Fórmula: price ~ end_0 + end_1 + ... + end_8 + area + area2 + distancias +
#          binarios + month + type + rooms + ownership + buildingmaterial
end_terms <- paste0("end_", 0:8)   # omite end_9 como base
present_bin <- intersect(bin_cols, names(df_clean))
present_dist <- intersect(c("schooldistance","clinicdistance","postofficedistance",
                            "kindergartendistance","restaurantdistance","collegedistance","pharmacydistance"),
                          names(df_clean))
present_cats <- intersect(cat_cols, names(df_clean))

rhs_terms <- c(end_terms, "area", "area2", present_dist, present_bin, present_cats)
rhs <- paste(rhs_terms, collapse = " + ")
form_full <- as.formula(paste("price ~", rhs))

# Ajuste
model_full <- lm(form_full, data = df_clean)
summary_full <- summary(model_full)
print(summary_full)

# Comentario sobre end_0:
if ("end_0" %in% rownames(coef(summary_full))) {
  coef_end0 <- coef(summary_full)["end_0", "Estimate"]
  pval_end0 <- coef(summary_full)["end_0", "Pr(>|t|)"]
  cat(sprintf("\n[3b-i] end_0 → coef = %.4f | p-valor = %.4g\n", coef_end0, pval_end0))
} else {
  cat("\n[3b-i] end_0 no está en el modelo (posible colinealidad al crear dummies). Revise end_0/end_9.\n")
}


Call:
lm(formula = form_full, data = df_clean)

Residuals:
     Min       1Q   Median       3Q      Max 
-1351990  -181093   -11144   162102  1872269 

Coefficients:
                                Estimate  Std. Error t value
(Intercept)                   108389.733   13108.964   8.268
end_0                          27604.142    5851.078   4.718
end_1                          -6417.613    6315.693  -1.016
end_2                          15111.447    6076.174   2.487
end_3                          46526.432    6125.825   7.595
end_4                          16387.963    6087.362   2.692
end_5                          21068.751    6056.247   3.479
end_6                           7788.229    5948.492   1.309
end_7                          18886.314    5984.596   3.156
end_8                           2287.968    5824.845   0.393
area                           12180.121     395.037  30.833
area2                             13.440       2.522   5.329
schooldistance                 22779.977

In [9]:
############################################################
# 3b-i LINEAR MODEL (OLS)
############################################################

# Fórmula: price ~ end_0 + end_1 + ... + end_8 + area + area2 + distancias +
#          binarios + month + type + rooms + ownership + buildingmaterial
end_terms <- paste0("end_", 0:8)   # omite end_9 como base
present_bin <- intersect(bin_cols, names(df_clean))
present_dist <- intersect(c("schooldistance","clinicdistance","postofficedistance",
                            "kindergartendistance","restaurantdistance","collegedistance","pharmacydistance"),
                          names(df_clean))
present_cats <- intersect(cat_cols, names(df_clean))

rhs_terms <- c(end_terms, "area", "area2", present_dist, present_bin, present_cats)
rhs <- paste(rhs_terms, collapse = " + ")
form_full <- as.formula(paste("price ~", rhs))

# Ajuste
model_full <- lm(form_full, data = df_clean)
summary_full <- summary(model_full)
print(summary_full)

# Comentario sobre end_0:
if ("end_0" %in% rownames(coef(summary_full))) {
  coef_end0 <- coef(summary_full)["end_0", "Estimate"]
  pval_end0 <- coef(summary_full)["end_0", "Pr(>|t|)"]
  cat(sprintf("\n[3b-i] end_0 → coef = %.4f | p-valor = %.4g\n", coef_end0, pval_end0))
} else {
  cat("\n[3b-i] end_0 no está en el modelo (posible colinealidad al crear dummies). Revise end_0/end_9.\n")
}


Call:
lm(formula = form_full, data = df_clean)

Residuals:
     Min       1Q   Median       3Q      Max 
-1351990  -181093   -11144   162102  1872269 

Coefficients:
                                Estimate  Std. Error t value
(Intercept)                   108389.733   13108.964   8.268
end_0                          27604.142    5851.078   4.718
end_1                          -6417.613    6315.693  -1.016
end_2                          15111.447    6076.174   2.487
end_3                          46526.432    6125.825   7.595
end_4                          16387.963    6087.362   2.692
end_5                          21068.751    6056.247   3.479
end_6                           7788.229    5948.492   1.309
end_7                          18886.314    5984.596   3.156
end_8                           2287.968    5824.845   0.393
area                           12180.121     395.037  30.833
area2                             13.440       2.522   5.329
schooldistance                 22779.977

In [11]:
############################################################
# 3b-ii PARTIALLING-OUT (FWL) para end_0
############################################################

# Construimos la misma especificación pero SIN end_0 en Z (covariables)
z_terms <- setdiff(rhs_terms, "end_0")
form_Z <- as.formula(paste("~", paste(z_terms, collapse = " + ")))

# Residuales de y ~ Z  (con intercepto, porque lm lo añade por defecto)
fit_y_Z <- lm(update(form_Z, price ~ .), data = df_clean)
y_res   <- residuals(fit_y_Z)

# Residuales de d ~ Z, donde d = end_0
if (!("end_0" %in% names(df_clean))) {
  stop("end_0 no está en df_clean; revise la creación de dummies del último dígito.")
}
fit_d_Z <- lm(update(form_Z, end_0 ~ .), data = df_clean)
d_res   <- residuals(fit_d_Z)

# Regr. de residuos (sin intercepto) -> coeficiente debe igualar al de OLS en end_0
po_fit <- lm(y_res ~ d_res - 1)
summary_po <- summary(po_fit)
print(summary_po)

# Comparación con OLS completo
if ("end_0" %in% rownames(coef(summary_full))) {
  coef_ols <- coef(summary_full)["end_0", "Estimate"]
  p_ols    <- coef(summary_full)["end_0", "Pr(>|t|)"]
} else {
  coef_ols <- NA; p_ols <- NA
}

coef_po <- coef(summary_po)["d_res", "Estimate"]
p_po    <- coef(summary_po)["d_res", "Pr(>|t|)"]

comp_tab <- data.frame(
  method      = c("Full OLS", "Partialling-out"),
  coef_end_0  = c(coef_ols, coef_po),
  p_value     = c(p_ols,    p_po)
)
print(comp_tab)

cat("\n[3b-ii] ¿Coeficientes (end_0) iguales? ",
    ifelse(is.na(coef_ols), "No disponible (end_0 ausente en OLS).",
           ifelse(all.equal(as.numeric(coef_ols), as.numeric(coef_po)) == TRUE, "Sí", "Aproximadamente sí / revisar diferencias de redondeo")),
    "\n", sep = "")


Call:
lm(formula = y_res ~ d_res - 1)

Residuals:
     Min       1Q   Median       3Q      Max 
-1351990  -181093   -11144   162102  1872269 

Coefficients:
      Estimate Std. Error t value   Pr(>|t|)    
d_res    27604       5849   4.719 0.00000237 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 307500 on 57284 degrees of freedom
Multiple R-squared:  0.0003887,	Adjusted R-squared:  0.0003712 
F-statistic: 22.27 on 1 and 57284 DF,  p-value: 0.000002372

           method coef_end_0        p_value
1        Full OLS   27604.14 0.000002389856
2 Partialling-out   27604.14 0.000002371541

[3b-ii] ¿Coeficientes (end_0) iguales? Sí


In [12]:
############################################################
# 3c-1 ENTRENAR SIN end_0 (excluir obs. cuyo área termina en 0)
############################################################

train_mask <- df_clean$last_digit != 0
df_train   <- df_clean[train_mask, , drop = FALSE]

# Importante:
# - Mantenemos la misma especificación que en 3b-i,
#   pero NO incluimos 'end_0' en la fórmula (de todos modos en train es 0 siempre).
rhs_terms_no_end0 <- setdiff(rhs_terms, "end_0")
rhs_no_end0 <- paste(rhs_terms_no_end0, collapse = " + ")
form_no0 <- as.formula(paste("price ~", rhs_no_end0))

model_no0 <- lm(form_no0, data = df_train)
summary_no0 <- summary(model_no0)
print(summary_no0)


Call:
lm(formula = form_no0, data = df_train)

Residuals:
     Min       1Q   Median       3Q      Max 
-1318869  -180616   -11312   160908  1853346 

Coefficients:
                                Estimate  Std. Error t value
(Intercept)                   113104.151   13819.207   8.185
end_1                          -6158.944    6277.622  -0.981
end_2                          15057.024    6039.582   2.493
end_3                          46911.351    6089.555   7.704
end_4                          16568.223    6051.493   2.738
end_5                          20589.215    6019.576   3.420
end_6                           7616.912    5912.630   1.288
end_7                          18627.093    5949.118   3.131
end_8                           2318.869    5790.006   0.400
area                           11990.717     420.292  28.529
area2                             16.048       2.692   5.962
schooldistance                 17189.381    6552.814   2.623
clinicdistance                -39819.351 

In [13]:
############################################################
# 3c-2 PREDICCIONES PARA TODA LA MUESTRA
############################################################

# Usamos df_clean completo como newdata. Como la fórmula NO tiene end_0,
# no hay problema en predecir para casos con last_digit == 0.
df_clean$price_pred_no0model <- predict(model_no0, newdata = df_clean)

In [14]:
############################################################
# 3c-3 COMPARAR PROMEDIOS (área termina en 0)
############################################################

mask_end0 <- df_clean$last_digit == 0

avg_actual <- mean(df_clean$price[mask_end0], na.rm = TRUE)
avg_pred   <- mean(df_clean$price_pred_no0model[mask_end0], na.rm = TRUE)
premium    <- avg_actual - avg_pred

cat(sprintf("\n[3c-3] Average ACTUAL (end_0): %.2f\n", avg_actual))
cat(sprintf("[3c-3] Average PREDICTED      : %.2f\n", avg_pred))
cat(sprintf("[3c-3] Actual - Predicted     : %.2f\n", premium))

# Comentario (no es test formal, solo interpretación)
if (is.finite(premium)) {
  if (premium > 0) {
    cat("[Comentario] En promedio, los departamentos con área que termina en 0\n",
        "se venden por ENCIMA de lo que predice el modelo (prima positiva).\n", sep = "")
  } else if (premium < 0) {
    cat("[Comentario] En promedio, los departamentos con área que termina en 0\n",
        "se venden por DEBAJO de lo que predice el modelo (prima negativa).\n", sep = "")
  } else {
    cat("[Comentario] No se observa diferencia promedio entre precio real y predicho para end_0.\n")
  }
} else {
  cat("[Comentario] No se pudo calcular la prima (valores no finitos).\n")
}


[3c-3] Average ACTUAL (end_0): 904560.95
[3c-3] Average PREDICTED      : 876881.82
[3c-3] Actual - Predicted     : 27679.13
[Comentario] En promedio, los departamentos con área que termina en 0
se venden por ENCIMA de lo que predice el modelo (prima positiva).
