# IMÓVEIS RURAIS

O objetivo é estimar valores de imóveis rurais em duas especificações: com base em variáveis genéricas de caráter físico (modelo R1) e com essas mesmas variáveis acrescidas de atributos relacionados ao uso dos imóveis e à qualidade de vida regional (modelo R2).

## Instalação de Pacotes Necessários

In [None]:
%time
import os
import numpy as np
import math
from numpy import mean
from numpy import std
import statsmodels.api as sm
from statsmodels.distributions.empirical_distribution import ECDF

!pip install --upgrade pandas
import pandas as pd
from pandas import read_csv

import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error, r2_score, make_scorer
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score, cross_val_predict

from sklearn.model_selection import GridSearchCV

from sklearn.feature_selection import RFECV
from sklearn.model_selection import StratifiedKFold
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor

from sklearn.model_selection import RepeatedKFold
from xgboost import XGBRegressor

!pip install --upgrade seaborn
import seaborn as sns

!pip install --upgrade scipy
import scipy as scipy
from scipy import stats

!pip install --upgrade shap
import shap as shap
shap.initjs()

In [None]:
!pip install --upgrade geopandas
import geopandas

!pip install --upgrade pysal
import pysal

!pip install --upgrade contextily
import contextily

import matplotlib.pyplot as plt
from sklearn.cluster import DBSCAN

from pysal.model import spreg
from pysal.lib import weights
from pysal.lib import cg as geometry

from spreg import OLS

## Importação de Dados do ME geolocalizados

In [None]:
!pip install --upgrade gspread

In [None]:
# Authorize this Colab session to access Google Drive / Google Sheets.

from google.colab import auth
auth.authenticate_user()  # sign the Colab runtime in with the user's Google account

import gspread
from google.auth import default
creds, _ = default()  # default credentials; the second tuple element is discarded

# gspread client used by the following cells to open the source spreadsheet.
gc = gspread.authorize(creds)

In [None]:
# Open the spreadsheet by title and take its first worksheet.
planilha = gc.open('nome_basededados_rural')
pagina = planilha.sheet1
pagina.row_values(1)  # display the first row of the sheet

In [None]:
# Load every record of the worksheet into a DataFrame.
df_Uniao = pd.DataFrame(pagina.get_all_records())

In [None]:
df_Uniao.head(5)

In [None]:
df_Uniao.columns

In [None]:
df_Uniao.shape

In [None]:
# Count missing values per column.
# NOTE(review): gspread's get_all_records() fills blank cells with '' by
# default, which isna()/dropna() do not treat as missing — confirm the sheet
# has no blank cells.
df_Uniao.isna().sum()

In [None]:
# Discard every row that has at least one missing value.
df_Uniao = df_Uniao.dropna()

In [None]:
# Renumber the rows 0..n-1 after the rows above were dropped.
df_Uniao = df_Uniao.reset_index(drop=True)

In [None]:
df_Uniao.dtypes

In [None]:
# Force the VTN_INCRA column to float.
df_Uniao = df_Uniao.astype({"VTN_INCRA": float})

df_Uniao.dtypes

In [None]:
df_Uniao.describe()

In [None]:
# Remove the row labelled 20, in place.
# NOTE(review): the reason is not recorded here (presumably an outlier) — confirm.
# The index is not reset afterwards, so row labels skip 20 from here on, and
# re-running this cell raises KeyError because the label no longer exists.
df_Uniao.drop([20], axis=0, inplace=True)

# Análise Imóveis Rurais

In [None]:
df_Uniao.info()

In [None]:
df_Uniao.columns

In [None]:
# Work on a copy so the scaled features do not overwrite the raw data.
df_Uniao_transf = df_Uniao.copy()

In [None]:
# Choose the set of explanatory variables. R1 is the alternative
# specification, left commented out; R2 is the one in use.
# R1
# explanatory_vars = ['A2', 'Água', 'AcessoPavimentado','DistanciaZonaUrbana']

# R2
explanatory_vars = ['A2', 'Água', 'AcessoPavimentado', 'VTN_ITR', 'DistanciaZonaUrbana', 'IDHM2010']

In [None]:
# Min-max scaler fitted on the explanatory columns below.
mms = MinMaxScaler()

# Replace the explanatory columns of the working copy by their scaled values.
df_Uniao_transf[explanatory_vars] = mms.fit_transform(df_Uniao_transf[explanatory_vars])

In [None]:
# Natural log of the area, computed from the unscaled frame.
LnArea = np.log(df_Uniao.loc[:,'Área'])

# Add it as an extra (unscaled) feature; the assignment aligns on the row index.
df_Uniao_transf.loc[:,'LnArea'] = LnArea
# NOTE(review): this mutates the list in place, so re-running the cell appends
# 'LnArea' a second time.
explanatory_vars.append('LnArea')

In [None]:
# Dependent variable: natural log of "ValorTotalAtualizado", shaped as an
# (n, 1) column vector.

valoratualizado = df_Uniao['ValorTotalAtualizado'].to_numpy(copy=True)
y = np.log(valoratualizado).reshape(len(valoratualizado), 1)

In [None]:
# Design matrix: one row per observation, one column per explanatory variable,
# in the order given by explanatory_vars.
X = np.array([df_Uniao_transf[nome] for nome in explanatory_vars]).T

In [None]:
# Quick sanity checks on the design matrix and the target.
print(X[0])  # feature values of the first observation

In [None]:
print(X)

In [None]:
len(y)

In [None]:
len(X)

In [None]:
# Column labels handed to the regressions. This is the same list object as
# explanatory_vars, not a copy.
variables = explanatory_vars

### Regressão Linear Múltipla (OLS)

In [None]:
# Fit the multiple linear regression with spreg's OLS.
# NOTE(review): spreg's OLS is documented to add the constant term itself,
# which would explain why X is passed without an intercept column — confirm.
ols = OLS(y, X, name_y = 'ValorAtualizado', name_x = variables)

In [None]:
print(ols.summary)

In [None]:
print(ols.u)

In [None]:
# Keep the OLS residuals (ols.u) for the confidence-interval analysis further down.
residuos = ols.u

In [None]:
# Root-mean-square error of the spreg OLS fit: ols.utu (sum of squared
# residuals) divided by the number of observations, then square-rooted.
MSE_ols = ols.utu / len(X)
# Fixed: this line used to read the undefined name `MSE`, raising NameError.
RMSE_ols = MSE_ols ** 0.5

print(RMSE_ols)

In [None]:
# Refit the same regression with statsmodels, which needs the intercept
# column to be added explicitly.
X_constante = sm.add_constant(X)
modelo = sm.OLS(y, X_constante).fit()
print(modelo.summary())

In [None]:
# In-sample fitted values. This name is rebound to a prediction-results object
# in a later cell.
predictions = modelo.predict(X_constante)

In [None]:
# Computation of Cook's distances

np.set_printoptions(suppress=True)  # print floats without scientific notation

influence = modelo.get_influence()

# statsmodels returns a pair here; element 0 is used as the distances below.
cooks = influence.cooks_distance

print(cooks)

In [None]:
# Confidence interval of the in-sample predictions (conf_int() called with its defaults).
resultado = modelo.get_prediction(X_constante)
ic_regressao = resultado.conf_int()
print(ic_regressao)

In [None]:
# Scatter of Cook's distance for every observation against its df_Uniao row label.
_, eixo_cook = plt.subplots(figsize=(14, 8))
eixo_cook.scatter(df_Uniao.index, cooks[0])
eixo_cook.set_xlabel('x')
eixo_cook.set_ylabel('Distância de Cook')
plt.show()

In [None]:
# Cook's distances as a one-column frame.
# NOTE(review): this frame gets a fresh 0..n-1 index, whereas df_Uniao's index
# skips the dropped label 20, so row numbers here are positions, not df_Uniao
# labels — confirm before mapping flagged rows back.
df_Cook = pd.DataFrame(cooks[0], columns = ['DistanciaCook'])

In [None]:
# Observations whose Cook's distance exceeds 1.0.
df_Cook.query('DistanciaCook > 1.0')

In [None]:
# In-sample prediction summary with alpha = 0.2 (i.e. 80% intervals).
# This rebinds `predictions`, which previously held the fitted values.
predictions = modelo.get_prediction(X_constante)
confidence_intervals = predictions.summary_frame(0.2)
confidence_intervals

### Regressão Espacial (S2SLS)

In [None]:
# Density map (2-D KDE) of the rural properties' coordinates over a basemap.

f, ax = plt.subplots(1, figsize=(20, 20))

# No ax= is passed, so seaborn draws on the current axes (the one just created).
sns.kdeplot(
    x = df_Uniao["xCoord"],
    y = df_Uniao["yCoord"],
    n_levels=4,
    fill=True,
    alpha=0.4,
    cmap="YlOrBr"
)

# Basemap tiles, with the axes coordinates declared as EPSG:4326.
# NOTE(review): the basemap is added before the axis limits are set below;
# if contextily sizes the tiles from the axes extent at call time, the tiles
# may not cover the whole final window — confirm, and move the limits up if so.
contextily.add_basemap(
    ax, source=contextily.providers.CartoDB.Positron,crs="EPSG:4326"
)

# Fixed viewing window, in the same units as xCoord / yCoord.
plt.xlim([-80,-30])
plt.ylim([-40,10])

plt.show()

In [None]:
# Geo-enabled copy of the data with one point per property, built from the
# xCoord / yCoord columns.
# The coordinates are treated as geographic degrees everywhere else in this
# notebook (the basemap is drawn with EPSG:4326 and the distance weights use
# an Earth radius), so the frame is tagged EPSG:4326 instead of the previous
# Web Mercator code (EPSG:3857), which mislabelled the data.
df_Uniao_geo = geopandas.GeoDataFrame(
    df_Uniao.copy(),
    geometry=geopandas.points_from_xy(df_Uniao["xCoord"], df_Uniao["yCoord"]),
    crs="EPSG:4326",
)

In [None]:
# Earth radius constant (kilometres, per its name) from pysal's geometry module.
radius = geometry.sphere.RADIUS_EARTH_KM
radius

In [None]:
# Distance-band spatial weights between the properties.
# binary=False with alpha=-1.0 requests distance-decay (inverse-distance) weights.
# NOTE(review): because a radius is passed, the threshold of 21.0 is presumably
# an arc distance in kilometres over lon/lat coordinates — confirm.
w = weights.DistanceBand.from_dataframe(df_Uniao_geo, threshold=21.0, binary=False, radius = radius, alpha = -1.0)
w.transform = 'r'  # row-standardize the weights

In [None]:
# Pairwise correlations of the numeric columns only. The geometry column (and
# any text column) cannot be correlated and makes DataFrame.corr() raise on
# pandas >= 2.0, where non-numeric columns are no longer dropped silently.
df_Uniao_geo.select_dtypes(include="number").corr()

In [None]:
# Explanatory columns only (scaled features plus LnArea) for the correlation heatmap.
df_Uniao_corr = df_Uniao_transf[explanatory_vars]

In [None]:
# Display labels for the heatmap axes. They are applied positionally, so they
# must stay in the same order (and number) as explanatory_vars.
legenda = ["Potencial A2", "Cursos d'Água", "Acesso Pavimentado", "VTN RFB", "Distância Zona Urbana", "IDHM", "Ln(Área do Terreno)"]

In [None]:
# Annotated correlation heatmap on a fixed [-1, 1] colour scale.
sns.set(font_scale=1.4)
plt.figure(figsize=(25, 25))
heatmap = sns.heatmap(df_Uniao_corr.corr(), vmin=-1, vmax=1, annot=True, cmap='BrBG', xticklabels=legenda, yticklabels=legenda)
heatmap.set_title('Mapa de Calor de Correlações de Pearson', fontdict={'fontsize':18}, pad=12)

In [None]:
# Spatial lag regression (spreg GM_Lag) on the same y and X as the OLS fit,
# using the distance-band weights w, with spatial diagnostics enabled.
lmgeo = spreg.GM_Lag(y, X, w=w, w_lags=1, spat_diag=True, name_y = 'LnValorTotalAtualizado', name_x = variables)

In [None]:
print(lmgeo.summary)

In [None]:
# Root-mean-square error of the spatial regression: lmgeo.utu (sum of squared
# residuals) divided by the number of observations, then square-rooted.
MSE_re = lmgeo.utu / len(X)
# Fixed: this line used to read the undefined name `MSE`, raising NameError.
RMSE_re = MSE_re ** 0.5

print(RMSE_re)

In [None]:
# Residuals of the spatial regression, reused by the confidence-interval analysis.
residuosespaciais = lmgeo.u

### Análise dos Intervalos de Confiança

In [None]:
# Mean of the OLS residuals.
media_residuos = np.mean(residuos)
media_residuos

In [None]:
# Standard deviation of the OLS residuals (np.std with its default ddof).
desvpad_residuos = np.std(residuos)
desvpad_residuos  # display the value

In [None]:
# Empirical CDF of the OLS residuals, flattened to 1-D.
ecdf_ols = ECDF(residuos.ravel())

In [None]:
# NOTE(review): with ECDF's default settings the value at 0 should be the share
# of residuals <= 0, while the label says "< 0" — confirm which is intended.
print('P(x<0): %.3f' % ecdf_ols(0))

In [None]:
# Share of OLS residuals at or below zero; used to split the RMSE between the
# two CI bounds.
ecdf_ols_neg = ecdf_ols(0)

In [None]:
len(y)

In [None]:
# Same empirical-CDF steps for the spatial-regression residuals.
ecdf_re = ECDF(residuosespaciais.ravel())

In [None]:
print('P(x<0): %.3f' % ecdf_re(0))

In [None]:
ecdf_re_neg = ecdf_re(0)

In [None]:
# 22 degrees of freedom
# Student-t critical value used for the two-sided 90% intervals below.
# NOTE(review): both the value and the degrees of freedom are hard-coded; they
# presumably need updating if the sample size or the number of explanatory
# variables changes — confirm.

t_student_bicaudal_90 = 1.717

In [None]:
# CI for the multiple linear regression (OLS)
# Each bound is exp(share of the RMSE +/- t * std / sqrt(n)); the RMSE is split
# by the share of residuals above / at-or-below zero. Because y is a logarithm,
# exp(...) - 1 is printed as a relative difference.

dif_relativa_sup = math.exp(RMSE_ols * (1 - ecdf_ols_neg) + (t_student_bicaudal_90 * desvpad_residuos / (len(y)**(0.5))))
dif_relativa_inf = math.exp(-RMSE_ols * ecdf_ols_neg - (t_student_bicaudal_90 * desvpad_residuos / (len(y)**(0.5))))

print("Semiamplitude Superior do IC 90 R2: %0.4f" % (dif_relativa_sup - 1))
print("Semiamplitude Inferior do IC 90 R2: %0.4f" % (dif_relativa_inf - 1))

In [None]:
# Mean and standard deviation of the spatial-regression residuals.
media_residuosespaciais = np.mean(residuosespaciais)
desvpad_residuosespaciais = np.std(residuosespaciais)

In [None]:
# CI for the spatial regression (S2SLS), printed as relative differences.
# Both bounds now use the spatial model's own residual statistics; the lower
# bound used to take the OLS share of residuals (ecdf_ols_neg), a copy-paste
# slip from the OLS cell.

# Student-t margin shared by both bounds: t * std / sqrt(n).
margem_t_re = t_student_bicaudal_90 * desvpad_residuosespaciais / (len(y)**(0.5))

dif_relativa_sup = math.exp(RMSE_re * (1 - ecdf_re_neg) + margem_t_re)
dif_relativa_inf = math.exp(-RMSE_re * ecdf_re_neg - margem_t_re)

print("Semiamplitude Superior do IC 90 R2: %0.4f" % (dif_relativa_sup - 1))
print("Semiamplitude Inferior do IC 90 R2: %0.4f" % (dif_relativa_inf - 1))

### Exportação

In [None]:
df_Uniao.columns

In [None]:
df_Uniao.head()

In [None]:
# Write the cleaned (unscaled) frame to CSV, with a header row and without the
# index column. The scaled features and model outputs are not part of it.
df_Uniao.to_csv('teste_rurais.csv', index = False, header=True)