In [None]:
install.packages("ggcorrplot")
install.packages("car")
install.packages("HH")
# Load necessary libraries
library(tidyverse)
library(ggplot2)
library(gridExtra)
library(cowplot)
library(ggcorrplot)
library(car)
library(HH)

### Reading the dataset

In [1]:
# Load the rent dataset. read.csv2() is read.csv() with the semicolon field
# separator, the decimal comma and header = TRUE already set as defaults.
data_rent <- read.csv2(file = "../../resources/clean-data/rent_2018_2023.csv")

### Convert categorical variables to factors (dummy variables)

In [2]:

# Columns that lm() should treat as categorical rather than numeric
factor_cols <- c("gym", "field_quadra", "elevator", "furnished", "swimming_pool")
# Convert them all in one pass; column order in data_rent is unchanged
data_rent[factor_cols] <- lapply(data_rent[factor_cols], as.factor)



### Convert the date column from text (dd/mm/yyyy) to the Date class

In [None]:
# Parse the day/month/year strings into Date values; entries that do not
# match the pattern become NA.
date_pattern <- "%d/%m/%Y"
data_rent[["date"]] <- as.Date(data_rent[["date"]], format = date_pattern)

In [None]:
# filter data by year
# The calendar year is read off the Date column with base R's format()
# instead of year(): year() belongs to lubridate, which the library() calls
# visible in this script never attach directly and which library(tidyverse)
# only attaches from tidyverse 2.0.0 onwards.
# filter() is namespace-qualified because packages attached after tidyverse
# (car, HH and their dependencies) can mask same-named functions.
# Rows whose date is NA are dropped by filter(), exactly as before.
data_2018 <- dplyr::filter(data_rent, format(date, "%Y") == "2018")
data_2019 <- dplyr::filter(data_rent, format(date, "%Y") == "2019")
data_2020 <- dplyr::filter(data_rent, format(date, "%Y") == "2020")
data_2021 <- dplyr::filter(data_rent, format(date, "%Y") == "2021")
data_2022 <- dplyr::filter(data_rent, format(date, "%Y") == "2022")
data_2023 <- dplyr::filter(data_rent, format(date, "%Y") == "2023")

In [None]:
# Model formula shared by every regression fitted in this script:
# price_m2 is the response, everything after `~` is an additive covariate.
params <- price_m2 ~
  area_m2 + suite + bathrooms + garage + condo_real +
  metro_dist_km + delta_cbd_farialima + idh +
  # the five columns converted to factors earlier in this script
  gym + field_quadra + elevator + furnished + swimming_pool

In [None]:
# Fit the same specification (params) by ordinary least squares:
# once on the pooled data ...
reg <- lm(formula = params, data = data_rent)
# ... and once on each single-year subset
reg_2018 <- lm(formula = params, data = data_2018)
reg_2019 <- lm(formula = params, data = data_2019)
reg_2020 <- lm(formula = params, data = data_2020)
reg_2021 <- lm(formula = params, data = data_2021)
reg_2022 <- lm(formula = params, data = data_2022)
reg_2023 <- lm(formula = params, data = data_2023)

In [None]:
# Select the pooled data and the pooled model as the pair analysed in the
# following cells, and report how many rows that data frame has.
data_rent_final <- data_rent
reg_final <- reg
print(nrow(data_rent_final))

In [None]:
# Display the fit summary of the selected model (reg_final)
summary(reg_final)

In [None]:
# Run car's outlier test on the selected model. n.max = Inf lifts the cap on
# how many observations are reported; cutoff = 100 is the threshold passed
# for the Bonferroni p-value.
outliers <- outlierTest(model = reg_final, cutoff = 100, n.max = Inf)
print(outliers)
# The reported studentized residuals are named by observation label;
# convert those labels to numbers so they can be used as row indices.
outlier_labels <- names(outliers[["rstudent"]])
outlier_rows <- as.numeric(outlier_labels)
print(length(outlier_rows))

In [None]:
# Print a selection of columns for the rows flagged as outliers
inspect_cols <- c(
  "price_m2", "price_real_month", "area_m2",
  "delta_cbd_farialima", "delta_cbd_paulista", "metro_dist_km"
)
print(data_rent_final[outlier_rows, inspect_cols])
# Five-number summaries over the full data, for comparison with the rows above
summary(data_rent_final$price_m2)
summary(data_rent_final$delta_cbd_farialima)

In [None]:
# Remove the outlier rows from the dataframe.
# outlier_rows is interpreted as row positions, as before, but the rows are
# dropped through a logical keep-mask instead of negative indexing: with an
# empty outlier_rows, data_rent_final[-outlier_rows, ] would select zero rows
# rather than keeping all of them. For a non-empty vector the result is the
# same as with negative indexing.
is_outlier <- seq_len(nrow(data_rent_final)) %in% outlier_rows
data_rent_wo_outliers <- data_rent_final[!is_outlier, ]
# Row counts before and after the removal
print(nrow(data_rent_final))
print(nrow(data_rent_wo_outliers))
# Refit the same specification on the trimmed data and show its summary
reg_wo_outliers <- lm(params, data = data_rent_wo_outliers)
summary(reg_wo_outliers)

In [None]:
# Residual diagnostics for the model refitted without the outlier rows.

# z-score helper: centre on the mean, divide by the sample standard deviation
z_score <- function(v) (v - mean(v)) / sd(v)

predicted <- predict(reg_wo_outliers)
residuals <- resid(reg_wo_outliers)
stand_predicted <- z_score(predicted)
stand_residuals <- z_score(residuals)

# z-scored residuals against z-scored fitted values, with a horizontal
# reference line at zero
plot(
  x = stand_predicted,
  y = stand_residuals,
  main = "Standardized residuals plot",
  xlab = "Standardized predicted value",
  ylab = "Standardized residuals"
)
abline(h = 0)

# Density-scaled histogram of the z-scored residuals with the standard
# normal density drawn on top
hist(stand_residuals, freq = FALSE)
curve(dnorm(x), add = TRUE)

# car's score test for non-constant error variance
ncvTest(reg_wo_outliers)

In [None]:
# Columns wanted in the correlation matrix. Selection is by name membership,
# so a name that is absent from the data frame is skipped without an error.
corr_vars <- c(
  "price_m2", "area_m2", "bedrooms", "suite", "bathrooms", "garage",
  "condo_real", "metro_dist_km", "delta_cbd_farialima", "delta_cbd_paulista"
)
in_corr_set <- names(data_rent_wo_outliers) %in% corr_vars
reduced_data <- data_rent_wo_outliers[, in_corr_set]

# Correlations rounded to 2 decimal places, drawn as a labelled
# lower-triangle plot with hierarchically clustered ordering
corr_matrix <- round(cor(reduced_data), digits = 2)
ggcorrplot(corr_matrix, hc.order = TRUE, type = "lower", lab = TRUE)