In [1]:
# importando os pacotes

import json
from pathlib import Path

import numpy as np
import pandas as pd
from joblib import dump, load
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split

In [2]:
# loading the property listings CSV into a DataFrame

file = 'datasets/sao-paulo-properties-april-2019.csv'
df = pd.read_csv(filepath_or_buffer=file)

In [3]:
# previewing the first 5 rows of the raw data

df.head(5)

Unnamed: 0,Price,Condo,Size,Rooms,Toilets,Suites,Parking,Elevator,Furnished,Swimming Pool,New,District,Negotiation Type,Property Type,Latitude,Longitude
0,930,220,47,2,2,1,1,0,0,0,0,Artur Alvim/São Paulo,rent,apartment,-23.543138,-46.479486
1,1000,148,45,2,2,1,1,0,0,0,0,Artur Alvim/São Paulo,rent,apartment,-23.550239,-46.480718
2,1000,100,48,2,2,1,1,0,0,0,0,Artur Alvim/São Paulo,rent,apartment,-23.542818,-46.485665
3,1000,200,48,2,2,1,1,0,0,0,0,Artur Alvim/São Paulo,rent,apartment,-23.547171,-46.483014
4,1300,410,55,2,2,1,1,1,0,0,0,Artur Alvim/São Paulo,rent,apartment,-23.525025,-46.482436


In [4]:
# counting the missing values across the whole DataFrame
# (per-column counts first, then summed into a single total)

df.isna().sum().to_numpy().sum()

0

In [5]:
# inspecting the data type of each column

df.dtypes

Price                 int64
Condo                 int64
Size                  int64
Rooms                 int64
Toilets               int64
Suites                int64
Parking               int64
Elevator              int64
Furnished             int64
Swimming Pool         int64
New                   int64
District             object
Negotiation Type     object
Property Type        object
Latitude            float64
Longitude           float64
dtype: object

In [6]:
# stripping the city name from the 'District' column
# (values look like '<district>/<city>'; only the part before the first '/' is kept)

# work on a copy so the raw frame stays available for re-runs
df_clean = df.copy()

# vectorized string split: no per-row Python lambda, and missing values
# propagate as NaN instead of raising on a non-string entry
df_clean['District'] = df_clean['District'].str.split('/', n = 1).str[0]

In [7]:
# previewing the first 5 rows after the 'District' cleanup
df_clean.head(5)

Unnamed: 0,Price,Condo,Size,Rooms,Toilets,Suites,Parking,Elevator,Furnished,Swimming Pool,New,District,Negotiation Type,Property Type,Latitude,Longitude
0,930,220,47,2,2,1,1,0,0,0,0,Artur Alvim,rent,apartment,-23.543138,-46.479486
1,1000,148,45,2,2,1,1,0,0,0,0,Artur Alvim,rent,apartment,-23.550239,-46.480718
2,1000,100,48,2,2,1,1,0,0,0,0,Artur Alvim,rent,apartment,-23.542818,-46.485665
3,1000,200,48,2,2,1,1,0,0,0,0,Artur Alvim,rent,apartment,-23.547171,-46.483014
4,1300,410,55,2,2,1,1,1,0,0,0,Artur Alvim,rent,apartment,-23.525025,-46.482436


In [8]:
# summary statistics for the numeric columns
# NOTE(review): the displayed summary shows Latitude/Longitude maxima of 0.0 and a
# Latitude minimum near -46.7, which look like missing or swapped coordinates —
# confirm before relying on these features.

df_clean.describe()

Unnamed: 0,Price,Condo,Size,Rooms,Toilets,Suites,Parking,Elevator,Furnished,Swimming Pool,New,Latitude,Longitude
count,13640.0,13640.0,13640.0,13640.0,13640.0,13640.0,13640.0,13640.0,13640.0,13640.0,13640.0,13640.0,13640.0
mean,287737.8,689.882331,84.3739,2.312023,2.07368,0.980792,1.393182,0.354179,0.146774,0.51217,0.015616,-22.077047,-43.597088
std,590821.4,757.649363,58.435676,0.777461,0.961803,0.834891,0.829932,0.478281,0.353894,0.49987,0.123988,5.866633,11.487288
min,480.0,0.0,30.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,-46.749039,-58.364352
25%,1858.75,290.0,50.0,2.0,2.0,1.0,1.0,0.0,0.0,0.0,0.0,-23.594552,-46.681671
50%,8100.0,500.0,65.0,2.0,2.0,1.0,1.0,0.0,0.0,1.0,0.0,-23.552813,-46.637255
75%,360000.0,835.0,94.0,3.0,2.0,1.0,2.0,1.0,0.0,1.0,0.0,-23.51764,-46.56004
max,10000000.0,9500.0,880.0,10.0,8.0,6.0,9.0,1.0,1.0,1.0,1.0,0.0,0.0


In [9]:
# one-hot encoding the categorical columns into dummy variables
# (numeric columns pass through unchanged)

df_clean = pd.get_dummies(data=df_clean)

In [10]:
# building the Machine Learning model: a random forest that predicts 'Price'
# from every other column of the encoded frame

X = df_clean.drop('Price', axis = 1)
y = df_clean['Price']

# hold out 33% of the rows for evaluation; the split is seeded so the metrics
# and the persisted model are reproducible across kernel restarts
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33, random_state = 42)

model = RandomForestRegressor(random_state = 42)
model.fit(X_train, y_train)

# predictions on the held-out rows, used by the evaluation cell
y_pred = model.predict(X_test)

In [11]:
# evaluating the model on the held-out test set:
# each metric is printed as '<label> <value rounded to 4 decimals>'

for label, metric in (('r2', r2_score), ('MAE', mean_absolute_error), ('MSE', mean_squared_error)):
    score = round(metric(y_test, y_pred), 4)
    print(f'{label} {score}')

r2 0.9258
MAE 47129.2066
MSE 23288827056.3598


In [12]:
# persisting the trained model to a file

# create the output directory first so the dump does not fail on a fresh checkout
Path('model').mkdir(parents = True, exist_ok = True)

dump(model, 'model/model.joblib')

['model/model.joblib']

In [13]:
# persisting the training column names to a file, so the exact feature
# order used to fit the model can be recovered later

features = X_train.columns.values

dump(value=features, filename='model/features.names')

['model/features.names']

In [14]:
# loading the persisted artifacts back from disk:
# the fitted model and the array of feature names

new_model = load(filename='model/model.joblib')
features = load(filename='model/features.names')

In [15]:
# building a JSON template that maps every feature column name to 0

# every feature gets the same placeholder value, so the mapping is sized by the
# columns themselves rather than by the number of rows in X
json_object = json.dumps(dict.fromkeys(X.columns.tolist(), 0), indent = 4)

print(json_object)

{
    "Condo": 0,
    "Size": 0,
    "Rooms": 0,
    "Toilets": 0,
    "Suites": 0,
    "Parking": 0,
    "Elevator": 0,
    "Furnished": 0,
    "Swimming Pool": 0,
    "New": 0,
    "Latitude": 0,
    "Longitude": 0,
    "District_Alto de Pinheiros": 0,
    "District_Anhanguera": 0,
    "District_Aricanduva": 0,
    "District_Artur Alvim": 0,
    "District_Barra Funda": 0,
    "District_Bela Vista": 0,
    "District_Bel\u00e9m": 0,
    "District_Bom Retiro": 0,
    "District_Brasil\u00e2ndia": 0,
    "District_Brooklin": 0,
    "District_Br\u00e1s": 0,
    "District_Butant\u00e3": 0,
    "District_Cachoeirinha": 0,
    "District_Cambuci": 0,
    "District_Campo Belo": 0,
    "District_Campo Grande": 0,
    "District_Campo Limpo": 0,
    "District_Canga\u00edba": 0,
    "District_Cap\u00e3o Redondo": 0,
    "District_Carr\u00e3o": 0,
    "District_Casa Verde": 0,
    "District_Cidade Ademar": 0,
    "District_Cidade Dutra": 0,
    "District_Cidade L\u00edder": 0,
    "District_Cidade T