### 1. Importing and Loading Required Libraries

#### 1.1. Installing and Loading Required Packages
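This section lists the R packages used for data analysis and visualization; the `install.packages()` and `library()` calls below are commented out and must be uncommented on a first run in a fresh session. The `ggcorrplot` package creates correlation plots, `car` offers regression diagnostics, and `HH` provides additional statistical methods. `lmtest` supplies the Breusch-Pagan and Durbin-Watson tests and `nortest` the Anderson-Darling test used in the residual diagnostics. The `tidyverse` (including `dplyr`) is used for data manipulation, `ggplot2` for visualization, and both `gridExtra` and `cowplot` help arrange multiple plots.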

In [70]:
#install.packages("showtext")
#install.packages("ggcorrplot")
#install.packages("car")
#install.packages("HH")
#install.packages("lmtest")
#install.packages("nortest")
#install.packages("dplyr")
#install.packages("lubridate")
# Load necessary libraries
#library(showtext)
#library(tidyverse)
#library(ggplot2)
#library(gridExtra)
#library(cowplot)
#library(dplyr)
#library(lubridate)
#library(ggcorrplot)
#library(car)
#library(HH)
#library(lmtest)
#library(nortest)

### 2. Data Preparation and Transformation

#### 2.1. Reading the Data
Reading the data from the CSV file and displaying the first few rows to understand the structure of the data.

In [71]:
# Load the listings dataset: comma-separated, "." as decimal mark, first row
# holds the column names.
# NOTE(review): the path is absolute and machine-specific — adjust it before
# running this notebook on another machine.
data_sell <- read.csv(
  file = "/Users/karimmbk/Documents/halef-thesis/regression/resources/Result_8_out.csv",
  header = TRUE,
  sep = ",",
  dec = "."
)

#### 2.2. Convert categorical variables to factors (dummy variables)
Converting categorical variables to factors to use them in the regression model.

In [72]:
# Recode the amenity indicator columns as factors so that lm() treats them as
# categorical predictors (one dummy coefficient per non-reference level).
amenity_cols <- c("gym", "field_quadra", "elevator", "furnished", "swimming_pool")
data_sell[amenity_cols] <- lapply(data_sell[amenity_cols], as.factor)

#### 2.3. Convert date to date format
Converting the date column to a date format to filter the data by year.

In [73]:
# Parse the day/month/year text in `date` into Date values.
data_sell[["date"]] <- as.Date(data_sell[["date"]], format = "%d/%m/%Y")

#### 2.4. Filter the data by year

In [74]:
# Split the listings into one data frame per calendar year using the `year`
# column. dplyr::filter() is called with an explicit namespace so the call
# cannot silently resolve to stats::filter() when dplyr is not attached (the
# library() calls at the top of this notebook are commented out).
data_sell_2018 <- dplyr::filter(data_sell, year == 2018)
data_sell_2019 <- dplyr::filter(data_sell, year == 2019)
data_sell_2020 <- dplyr::filter(data_sell, year == 2020)
data_sell_2021 <- dplyr::filter(data_sell, year == 2021)

# The analysis that follows is run on the 2020 listings only.
data_sell_final <- data_sell_2020

# Report the total number of rows and the number of rows kept for the analysis.
print(nrow(data_sell))
print(nrow(data_sell_final))

[1] 32712
[1] 2258


In [75]:
#' Remove IQR outliers from a data frame
#'
#' Keeps only the rows whose value in `column` lies inside the fences
#' `[Q1 - k * IQR, Q3 + k * IQR]`, where Q1 and Q3 are the 25th and 75th
#' percentiles of that column (computed ignoring `NA`).
#'
#' @param df A data frame.
#' @param column Name (single string) of the numeric column to screen.
#' @param k Fence multiplier applied to the IQR. Defaults to 1.5.
#' @return `df` restricted to the rows inside the fences, always as a data
#'   frame. Rows where `column` is `NA` are dropped, because they cannot be
#'   compared against the fences.
remove_outliers <- function(df, column, k = 1.5) {
  stopifnot(
    is.data.frame(df),
    is.character(column), length(column) == 1L,
    column %in% names(df),
    is.numeric(df[[column]]),
    is.numeric(k), length(k) == 1L
  )

  values <- df[[column]]

  # Quartiles and interquartile range of the screened column.
  quartiles <- quantile(values, probs = c(0.25, 0.75), na.rm = TRUE, names = FALSE)
  iqr <- quartiles[2] - quartiles[1]
  lower_bound <- quartiles[1] - k * iqr
  upper_bound <- quartiles[2] + k * iqr

  # `!is.na(values)` keeps the mask free of NA, so logical indexing cannot
  # inject all-NA rows into the result.
  keep <- !is.na(values) & values >= lower_bound & values <= upper_bound

  # drop = FALSE keeps a data frame even when `df` has a single column.
  df[keep, , drop = FALSE]
}

# Screen the analysis sample for price-per-m2 outliers.
df_filtered <- remove_outliers(df = data_sell_final, column = "price_m2")

# Number of rows discarded, followed by the number of rows retained.
print(nrow(data_sell_final) - nrow(df_filtered))
print(nrow(df_filtered))

# Descriptive statistics of the retained sample.
summary(df_filtered)

[1] 59
[1] 2199


    quarter           year           date               area_m2      
 Min.   :1.000   Min.   :2020   Min.   :2020-01-01   Min.   : 22.00  
 1st Qu.:1.000   1st Qu.:2020   1st Qu.:2020-03-25   1st Qu.: 60.00  
 Median :2.000   Median :2020   Median :2020-06-25   Median : 73.00  
 Mean   :2.451   Mean   :2020   Mean   :2020-06-30   Mean   : 84.16  
 3rd Qu.:3.500   3rd Qu.:2020   3rd Qu.:2020-09-27   3rd Qu.: 93.00  
 Max.   :4.000   Max.   :2020   Max.   :2020-12-31   Max.   :490.00  
                                                                     
    bedrooms         suite         bathrooms         garage     
 Min.   :1.000   Min.   :1.000   Min.   :1.000   Min.   :1.000  
 1st Qu.:2.000   1st Qu.:1.000   1st Qu.:1.000   1st Qu.:1.000  
 Median :2.000   Median :1.000   Median :1.000   Median :2.000  
 Mean   :2.445   Mean   :1.095   Mean   :1.471   Mean   :2.007  
 3rd Qu.:3.000   3rd Qu.:1.000   3rd Qu.:2.000   3rd Qu.:2.000  
 Max.   :6.000   Max.   :4.000   Max.   :4.000   M

### 3. Model Specification and Fitting
 Analyzing the correlation between the variables to understand the relationship between them.

In [76]:
#corr_data <- data_sell_final[, (names(data_sell_final) %in% c("price_real_month", "area_m2", "bedrooms", "bathrooms", "garage", "condo_real", "metro_dist_km", "delta_cbd_farialima", "teleworkable", "inequality_meter", "suite"))]

# Compute correlation at 2 decimal places
#corr_matrix <- round(cor(corr_data), 2)
#ggcorrplot(corr_matrix, hc.order = TRUE, type = "lower", lab = TRUE)

#### 3.1. Defining the Model parameters
This subsection defines the formula for the regression model, specifying `price_m2` as the dependent variable and various property features as independent variables.

In [77]:
# Log-transformed price and area.
# NOTE(review): neither vector is referenced by the formula below, which models
# price_m2 in levels — confirm whether a log specification was intended.
price_m2_log <- log(df_filtered[["price_m2"]])
area_m2_log <- log(df_filtered[["area_m2"]])

# Regression formula: price per m2 explained by size, location and amenity
# variables. Term order determines the order of the coefficient table.
params <- price_m2 ~
  area_m2 + bedrooms +
  metro_dist_km + delta_cbd_farialima +
  inequality_meter + teleworkable +
  garage + gym + field_quadra + elevator + swimming_pool

#data_sell_final_2018 <- filter(data_sell_final, year(date) == 2018)
# Ordinary least squares fit on the outlier-screened sample.
reg <- lm(formula = params, data = df_filtered)

#### 3.2 Checking the results for the linear regression model
Checking the results for the linear regression model to understand the relationship between the variables.

In [78]:
# Print the fitted model's residual quantiles and coefficient table
# (estimates, standard errors, t values and p-values).
summary(reg)


Call:
lm(formula = params, data = df_filtered)

Residuals:
    Min      1Q  Median      3Q     Max 
-5800.2 -1288.6  -195.6  1158.8  5748.6 

Coefficients:
                     Estimate Std. Error t value Pr(>|t|)    
(Intercept)         8022.9395   430.9576  18.617  < 2e-16 ***
area_m2                0.3598     1.3931   0.258  0.79619    
bedrooms            -825.1786    77.4370 -10.656  < 2e-16 ***
metro_dist_km       -138.5291    34.1237  -4.060 5.09e-05 ***
delta_cbd_farialima -244.2393    14.2850 -17.098  < 2e-16 ***
inequality_meter      45.0853     5.4920   8.209 3.83e-16 ***
teleworkable        4209.4659   561.8768   7.492 9.93e-14 ***
garage               608.9246    68.6545   8.869  < 2e-16 ***
gym1                 625.8885   218.6729   2.862  0.00425 ** 
field_quadra1       -145.7537   142.7733  -1.021  0.30743    
elevator1           -506.8195   164.2971  -3.085  0.00206 ** 
swimming_pool1       156.5781   226.5260   0.691  0.48951    
---
Signif. codes:  0 ‘***’ 0.001 ‘**

#### 3.3. Running the diagnostic tests
Running the diagnostic tests to check the normality, heteroskedasticity, and autocorrelation of the residuals.

In [79]:
# Residual diagnostics for `reg`: normality, heteroskedasticity and
# autocorrelation. The lmtest / nortest functions are called with an explicit
# namespace so this cell does not depend on those packages being attached
# (the library() calls at the top of this notebook are commented out).
residuals_best_model <- resid(reg)
sample_size <- length(residuals_best_model)

# Significance level shared by the p-value based decisions below.
significance_level <- 0.01

# --- Normality of residuals ---
# Shapiro-Wilk is run only for 3 to 5000 observations; outside that range the
# Anderson-Darling test is used instead.
if (sample_size >= 3 && sample_size <= 5000) {
  shapiro_test <- shapiro.test(residuals_best_model)
  normality_test <- shapiro_test
} else {
  print("Sample size out of range for Shapiro-Wilk test, using Anderson-Darling test instead.")
  ad_test <- nortest::ad.test(residuals_best_model)
  normality_test <- ad_test
}
print(normality_test)
# H0: residuals are normally distributed.
if (normality_test$p.value > significance_level) {
  print("Pass: Residuals are normally distributed (fail to reject H0).")
} else {
  print("Residuals are not normally distributed (reject H0).")
}

# --- Heteroskedasticity (Breusch-Pagan) ---
# H0: residual variance is constant.
bp_test <- lmtest::bptest(reg)
print(bp_test)
if (bp_test$p.value < significance_level) {
  print("Heteroskedasticity detected (reject H0).")
} else {
  print("Pass: No heteroskedasticity detected (fail to reject H0).")
}

# --- Autocorrelation (Durbin-Watson) ---
# The decision uses the rule-of-thumb band [1.5, 2.5] on the DW statistic
# rather than the test's p-value.
dw_test <- lmtest::dwtest(reg)
print(dw_test)
dw_stat <- dw_test$statistic
if (dw_stat < 1.5) {
  print("Positive autocorrelation detected.")
} else if (dw_stat > 2.5) {
  print("Negative autocorrelation detected.")
} else {
  print("Pass: No autocorrelation detected.")
}


	Shapiro-Wilk normality test

data:  residuals_best_model
W = 0.98833, p-value = 4.05e-12

[1] "Residuals are not normally distributed (reject H0)."

	studentized Breusch-Pagan test

data:  reg
BP = 324.41, df = 11, p-value < 2.2e-16

[1] "Heteroskedasticity detected (reject H0)."

	Durbin-Watson test

data:  reg
DW = 1.1891, p-value < 2.2e-16
alternative hypothesis: true autocorrelation is greater than 0

[1] "Positive autocorrelation detected."
