In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import LinearSVR
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.model_selection import KFold
from scipy.stats import pearsonr
from statistics import mean

In [2]:
# Importação dos conjutos de amostras
dados_treino = pd.read_csv("conjunto_de_treinamento.csv", delimiter=",", decimal=".")
dados_teste = pd.read_csv("conjunto_de_teste.csv", delimiter=",", decimal=".")

In [3]:
dados_treino

Unnamed: 0,Id,tipo,bairro,tipo_vendedor,quartos,suites,vagas,area_util,area_extra,diferenciais,...,estacionamento,piscina,playground,quadra,s_festas,s_jogos,s_ginastica,sauna,vista_mar,preco
0,2000,Casa,Imbiribeira,Imobiliaria,3,3,5,223,167,piscina e copa,...,0,1,0,0,0,0,0,0,0,1000000.0
1,2001,Apartamento,Casa Amarela,Imobiliaria,4,4,2,157,0,piscina e churrasqueira,...,0,1,0,0,0,0,0,0,0,680000.0
2,2002,Apartamento,Encruzilhada,Imobiliaria,3,1,0,53,0,nenhum,...,0,0,0,0,0,0,0,0,0,450000.0
3,2003,Apartamento,Boa Viagem,Imobiliaria,4,3,2,149,0,piscina e churrasqueira,...,0,1,0,0,0,0,0,0,0,1080000.0
4,2004,Apartamento,Rosarinho,Imobiliaria,2,1,1,54,0,piscina e churrasqueira,...,0,1,0,0,0,0,0,0,0,350000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4678,6678,Apartamento,Boa Viagem,Imobiliaria,4,4,3,170,0,piscina e churrasqueira,...,0,1,0,0,0,0,0,0,0,1200000.0
4679,6679,Apartamento,Setubal,Imobiliaria,2,0,1,82,0,churrasqueira,...,0,0,0,0,0,0,0,0,0,258000.0
4680,6680,Apartamento,Torre,Imobiliaria,3,1,1,75,0,nenhum,...,0,0,0,0,0,0,0,0,0,252000.0
4681,6681,Apartamento,Casa Amarela,Imobiliaria,3,2,3,136,0,piscina e copa,...,0,1,0,0,0,0,0,0,0,780000.0


In [4]:
dados_teste

Unnamed: 0,Id,tipo,bairro,tipo_vendedor,quartos,suites,vagas,area_util,area_extra,diferenciais,churrasqueira,estacionamento,piscina,playground,quadra,s_festas,s_jogos,s_ginastica,sauna,vista_mar
0,0,Apartamento,Pina,Imobiliaria,4,4,3,182,0,copa e playground,0,0,0,1,0,0,0,0,0,0
1,1,Apartamento,Tamarineira,Imobiliaria,2,0,1,85,0,nenhum,0,0,0,0,0,0,0,0,0,0
2,2,Apartamento,Boa Viagem,Imobiliaria,3,1,2,115,20,piscina e churrasqueira,1,0,1,0,0,0,0,0,0,0
3,3,Apartamento,Iputinga,Imobiliaria,3,0,1,92,0,nenhum,0,0,0,0,0,0,0,0,0,0
4,4,Apartamento,Engenho do Meio,Imobiliaria,3,1,1,65,0,piscina e copa,0,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,1995,Apartamento,Poco da Panela,Imobiliaria,2,1,2,100,0,nenhum,0,0,0,0,0,0,0,0,0,0
1996,1996,Apartamento,Madalena,Imobiliaria,3,1,1,90,0,nenhum,0,0,0,0,0,0,0,0,0,0
1997,1997,Apartamento,Boa Viagem,Imobiliaria,3,1,1,73,0,piscina e salao de festas,0,0,1,0,0,1,0,0,0,0
1998,1998,Apartamento,Torre,Imobiliaria,4,3,2,112,2810,piscina e frente para o mar,0,0,1,0,0,0,0,0,0,1


# Remoção dos outliers

Como discutido em sala de aula e por recomendação do professor, remove-se os outliers do conjunto de treino.

In [5]:
dados_treino = dados_treino[(dados_treino['preco'] > 50000)&(dados_treino['preco'] < 10000000)]

# Análise dos gráficos

In [6]:
#for coluna in dados_treino.columns:
#     dados_treino.plot.scatter(x=coluna, y='preco')

In [7]:
colunas_com_outliers = ['area_util', 'area_extra', 'vagas']

# Tratamento dos dados

## Remoção e separação de colunas

In [8]:
# id - remover
# tipo, tipo_vendedor - get_dummies
# bairro - transformar em preço_medio do bairro > x vezes a media dos imoveis
# vagas - tirar outliers
# area_util - tirar outliers
# area_extra - tirar outliers
# diferenciais - remover 

dados_treino = dados_treino.drop(columns=['Id', 'diferenciais'])
dados_teste = dados_teste.drop(columns=['Id', 'diferenciais'])

In [9]:
media_imoveis = dados_treino['preco'].mean()

def classifica_bairros(df):
    classificacao_bairros = {}

    for bairro in df['bairro']:
        media_preco = dados_treino[dados_treino['bairro'] == bairro]['preco'].mean()

        if media_preco <= media_imoveis/6:
            classificacao_bairros[bairro] = 'pobre'
        elif media_preco <= media_imoveis/3:
            classificacao_bairros[bairro] = 'mediano'
        elif media_preco <= media_imoveis:
            classificacao_bairros[bairro] = 'normal'
        elif media_preco >= media_imoveis*3:
            classificacao_bairros[bairro] = '5_estrelas'
        elif media_preco >= media_imoveis*1.5:
            classificacao_bairros[bairro] = 'luxo'
        elif media_preco >= media_imoveis:
            classificacao_bairros[bairro] = 'bom'

    return classificacao_bairros

dados_teste['bairro'] = dados_teste['bairro'].map(classifica_bairros(dados_teste))
dados_treino['bairro'] = dados_treino['bairro'].map(classifica_bairros(dados_treino))

## Substituindo valores NaN

In [10]:
print("Dados NULL por coluna no conjunto de treino")
print(dados_treino.isnull().sum(), end='\n\n')
print("Dados NULL por coluna no conjunto de teste")
print(dados_teste.isnull().sum())

Dados NULL por coluna no conjunto de treino
tipo              0
bairro            0
tipo_vendedor     0
quartos           0
suites            0
vagas             0
area_util         0
area_extra        0
churrasqueira     0
estacionamento    0
piscina           0
playground        0
quadra            0
s_festas          0
s_jogos           0
s_ginastica       0
sauna             0
vista_mar         0
preco             0
dtype: int64

Dados NULL por coluna no conjunto de teste
tipo              0
bairro            4
tipo_vendedor     0
quartos           0
suites            0
vagas             0
area_util         0
area_extra        0
churrasqueira     0
estacionamento    0
piscina           0
playground        0
quadra            0
s_festas          0
s_jogos           0
s_ginastica       0
sauna             0
vista_mar         0
dtype: int64


## Splitando variáveis categóricas

In [11]:
dados_teste[['bairro']] = dados_teste[['bairro']].fillna('normal')

In [12]:
valores_a_trocar = ['tipo', 'tipo_vendedor', 'bairro']

dados_treino = pd.get_dummies(dados_treino, columns=valores_a_trocar)
dados_teste = pd.get_dummies(dados_teste, columns=valores_a_trocar)

In [13]:
dados_teste['tipo_Quitinete'] = [0 for indice in range(len(dados_teste))] # cria uma coluna em dados_teste que não existia

# Removendo Outliers

In [14]:
dados_treino = dados_treino.astype(float)
dados_teste = dados_teste.astype(float)

In [15]:
# avaliar apenas em colunas com falsos 
for coluna in colunas_com_outliers:
    Q1 = np.percentile(dados_treino[coluna], 25,
                   method = 'midpoint')
 
    Q3 = np.percentile(dados_treino[coluna], 75,
                   method = 'midpoint')
    IQR = Q3 - Q1
 
    max_ = Q3+1.5*IQR
    min_ = Q1-1.5*IQR
    
 
    dados_treino.loc[dados_treino[coluna] < min_, coluna] = np.nan
    dados_treino.loc[dados_treino[coluna] > max_, coluna] = np.nan

In [16]:
dados_treino.isnull().sum()

quartos                          0
suites                           0
vagas                          191
area_util                      245
area_extra                     549
churrasqueira                    0
estacionamento                   0
piscina                          0
playground                       0
quadra                           0
s_festas                         0
s_jogos                          0
s_ginastica                      0
sauna                            0
vista_mar                        0
preco                            0
tipo_Apartamento                 0
tipo_Casa                        0
tipo_Loft                        0
tipo_Quitinete                   0
tipo_vendedor_Imobiliaria        0
tipo_vendedor_Pessoa Fisica      0
bairro_bom                       0
bairro_luxo                      0
bairro_mediano                   0
bairro_normal                    0
dtype: int64

In [17]:
for coluna in colunas_com_outliers:
    dados_treino[[coluna]] = dados_treino[[coluna]].fillna(dados_treino[coluna].median())

In [18]:
dados_treino

Unnamed: 0,quartos,suites,vagas,area_util,area_extra,churrasqueira,estacionamento,piscina,playground,quadra,...,tipo_Apartamento,tipo_Casa,tipo_Loft,tipo_Quitinete,tipo_vendedor_Imobiliaria,tipo_vendedor_Pessoa Fisica,bairro_bom,bairro_luxo,bairro_mediano,bairro_normal
0,3.0,3.0,1.0,223.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
1,4.0,4.0,2.0,157.0,0.0,1.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
2,3.0,1.0,0.0,53.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
3,4.0,3.0,2.0,149.0,0.0,1.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
4,2.0,1.0,1.0,54.0,0.0,1.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4678,4.0,4.0,3.0,170.0,0.0,1.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
4679,2.0,0.0,1.0,82.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
4680,3.0,1.0,1.0,75.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
4681,3.0,2.0,3.0,136.0,0.0,0.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0


## Análise da correlação das variáveis

In [19]:
# calcula o coeficiente de Pearson para cada coluna em relação ao alvo
pearson_coef = {}

for coluna in dados_treino.columns:
    coef = round(abs(pearsonr(dados_treino[coluna], dados_treino['preco'])[0]),4)
    if type(coef) not in [float, int] or coluna == 'preco':
        pearson_coef[coluna] = coef
        
from operator import itemgetter

coef_ordenados = sorted(pearson_coef.items(), key=itemgetter(1))

coef_ordenados = [[tupla[0], tupla[1]] for tupla in coef_ordenados]

coef_ordenados = sorted(coef_ordenados, key=itemgetter(1))
coef_ordenados



[['quadra', 0.0027],
 ['s_ginastica', 0.003],
 ['s_jogos', 0.0137],
 ['tipo_Loft', 0.0153],
 ['tipo_Quitinete', 0.0182],
 ['bairro_mediano', 0.0282],
 ['tipo_vendedor_Imobiliaria', 0.0297],
 ['tipo_vendedor_Pessoa Fisica', 0.0297],
 ['estacionamento', 0.0452],
 ['churrasqueira', 0.046],
 ['playground', 0.0538],
 ['s_festas', 0.0695],
 ['piscina', 0.0778],
 ['tipo_Apartamento', 0.1032],
 ['tipo_Casa', 0.1086],
 ['sauna', 0.1377],
 ['vista_mar', 0.1919],
 ['bairro_luxo', 0.2119],
 ['bairro_bom', 0.2293],
 ['bairro_normal', 0.3366],
 ['vagas', 0.3792],
 ['area_util', 0.4852],
 ['quartos', 0.564],
 ['suites', 0.6867],
 ['preco', 1.0]]

In [20]:
indice_aceitavel = 0 # todas as variaveis que possuem Pearson não nulo
variaveis_escolhidas = []

for item in coef_ordenados[indice_aceitavel:]:
    variaveis_escolhidas.append(item[0])

dados_treino = dados_treino[variaveis_escolhidas]
dados_teste = dados_teste[variaveis_escolhidas[:-1]]

## Separação dos dados

In [21]:
#separacao do conjunto de treino em alvo e features e em subconjunto de teste e treino
X = dados_treino.drop(columns=['preco'])
Y = np.array(dados_treino['preco'])

## Escalando os dados

In [22]:
standard_scaler =  StandardScaler()

X = standard_scaler.fit_transform(X)
dados_teste = standard_scaler.transform(dados_teste)

# Escolhendo variáveis e criando os modelos preditivos

In [23]:
kf = KFold(n_splits=5, shuffle=True, random_state=1456) # cria instancia do CV

def rmspe(y_true, y_pred):
    return np.sqrt(np.mean(np.square(((y_true - y_pred) / y_true))))

## Criando KNN

In [24]:
print("  K   |    RMSPE   ")
for numero_kneighbors in range(4, 9, 1):
    regressorKNN = KNeighborsRegressor(n_neighbors=numero_kneighbors, weights='uniform')
    
    sfs = SequentialFeatureSelector(regressorKNN, n_features_to_select=10)
    sfs.fit(X, Y)
    
    erro_percentual_medio = []
    for indice_treino, indice_teste in kf.split(X):
        regressorKNN.fit(sfs.transform(X[indice_treino]), Y[indice_treino])

        predicao = regressorKNN.predict(sfs.transform(X[indice_teste]))
        erro_percentual_medio.append(rmspe(predicao, Y[indice_teste]))
    
    print(  "%d    " %numero_kneighbors, end='')
    print(f"    {round(mean(erro_percentual_medio), 4)}")

  K   |    RMSPE   
4        0.3539
5        0.3477
6        0.345
7        0.347
8        0.3542


## Criando Random Forest

In [25]:
print("  N   |   RMSPE  ")
for numero in range(20, 25, 1):
    regressorRF = RandomForestRegressor(n_estimators=numero)
     
    sfs = SequentialFeatureSelector(regressorRF, n_features_to_select=10)
    sfs.fit(X, Y)
    
    erro_percentual_medio = []
    for indice_treino, indice_teste in kf.split(X):
        regressorRF.fit(sfs.transform(X[indice_treino]), Y[indice_treino])

        predicao = regressorRF.predict(sfs.transform(X[indice_teste]))
        erro_percentual_medio.append(rmspe(predicao, Y[indice_teste]))
    
    print("%d    " %numero, end='')
    print(f"  {round(mean(erro_percentual_medio), 4)}")

  N   |   RMSPE  
20      0.3187
21      0.3198
22      0.3258
23      0.3286
24      0.3179


## Criando modelo final

In [55]:
class Regressor():
    def __init__(self):
        self.RFC = RandomForestRegressor(n_estimators=24)
        self.KNN = KNeighborsRegressor(n_neighbors=6, weights='uniform')
        self.modelos = [self.KNN, self.RFC]
    def fit(self, X, Y):
        self.rmspe_modelos = []
        for indice, modelo in enumerate(self.modelos):
            self.sfs = SequentialFeatureSelector(modelo, n_features_to_select='auto', tol=None)
            self.sfs.fit(X, Y)
            self.modelos[indice] = modelo.fit(self.sfs.transform(X), Y)
            
            erro_percentual_medio = []
            for indice_treino, indice_teste in kf.split(X):
                modelo.fit(sfs.transform(X[indice_treino]), Y[indice_treino])

                predicao_teste = modelo.predict(self.sfs.transform(X[indice_teste]))
                predicao_treino = modelo.predict(self.sfs.transform(X[indice_treino]))
                erro_percentual_medio.append(rmspe(predicao_teste, Y[indice_teste]))
            self.rmspe_modelos.append(mean(erro_percentual_medio))
    def predict(self, X_alvo):
        predicao_modelos = [modelo.predict(self.sfs.transform(X_alvo)) for modelo in self.modelos]
        
        peso_KNN = 1/self.rmspe_modelos[0]
        peso_RFC = 1/self.rmspe_modelos[1]
        
        return [ (predicao_modelos[0][indice]*peso_KNN + predicao_modelos[1][indice]*peso_RFC)/(peso_RFC+peso_KNN) \
                for indice in range(len(X_alvo))]
        
            

In [56]:
regressor = Regressor()

rmspe_teste = []
rmspe_treino = []

## Avaliação do modelo

In [57]:
# faz a CV do modelo final
print("  TESTE  |  TREINO")
for indice_treino, indice_teste in kf.split(X):
    regressor.fit(X[indice_treino], Y[indice_treino])
    yteste = regressor.predict(X[indice_teste])
    ytreino = regressor.predict(X[indice_treino])
    
    score_teste = round(rmspe(yteste, Y[indice_teste]), 4)
    score_treino = round(rmspe(ytreino, Y[indice_treino]) ,4)
    
    rmspe_teste.append(score_teste)
    rmspe_treino.append(score_treino)
    
    print(f"   {score_teste}     ", end='')
    print(score_treino)

print("-------------------")
print("  TESTE  |  TREINO")
print(f"   {round(mean(rmspe_teste), 4)}     ", end='')
print(round(mean(rmspe_treino), 4))

  TESTE  |  TREINO
   0.3451549855077387     0.15411116072185632
   0.28823221794251047     0.1603065887787512
   0.2712756873512424     0.15123138463344257
   0.42389008395884176     0.1517971274502535
   0.2898282777072537     0.15197097735363055
-------------------
  TESTE  |  TREINO
   0.3237     0.1539


# Geração do arquivo de respostas

In [58]:
regressor.fit(X, Y)

x_resposta = dados_teste

y_resposta = regressor.predict(x_resposta)
#y_resposta = list(map(float, y_resposta)) 
y_resposta

array([2017510.58333333,  269166.66666667,  550312.5       , ...,
        270660.21825397,  776677.17013889,  322434.31597222])

In [59]:
id_solicitante = [x for x in range(2000)]

dados_resposta = pd.DataFrame(list(zip(id_solicitante, y_resposta)), columns=['Id', 'preco'])

In [60]:
dados_resposta.to_csv("arquivo_resposta.csv", index=False)