# Testando o modelo

Notebook para testar o modelo escolhido no dataset de teste.

## Bibliotecas

In [8]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sqlite3
import numpy as np

from sklearn.linear_model import LinearRegression
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, FunctionTransformer
from category_encoders import BinaryEncoder, CountEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error, r2_score

import xgboost as xgb

import pickle

## Carregando dataset de teste

In [4]:
# Conectar ao banco de dados SQLite
conn = sqlite3.connect('data/house_prices.db')

# Ler a tabela 'df_test' para um DataFrame
df_test = pd.read_sql_query('SELECT * FROM df_test', conn)

# Fechar a conexão
conn.close()

# Exibir as primeiras linhas do DataFrame para verificar
df_test.head()

Unnamed: 0,Type,Region,MunicipalityCode,Prefecture,Municipality,DistrictName,NearestStation,TimeToNearestStation,MinTimeToNearestStation,MaxTimeToNearestStation,...,Breadth,CityPlanning,CoverageRatio,FloorAreaRatio,Period,Year,Quarter,Renovation,Remarks,TradePrice
0,Pre-owned Condominiums etc.,,13103,Tokyo,Minato Ward,Toranomon,Kamiyacho,4,4,4,...,,Commercial Zone,80,500,3rd quarter 2016,2016,3,Not yet,,
1,Pre-owned Condominiums etc.,,13110,Tokyo,Meguro Ward,Higashiyama,Ikejiriohashi,7,7,7,...,,Category I Residential Zone,60,300,3rd quarter 2012,2012,3,,,
2,Pre-owned Condominiums etc.,,13112,Tokyo,Setagaya Ward,Kitakarasuyama,Chitosekarasuyama,25,25,25,...,,Category I Exclusively Low-story Residential Zone,50,100,4th quarter 2015,2015,4,Done,,
3,Pre-owned Condominiums etc.,,13121,Tokyo,Adachi Ward,Ayase,Ayase,4,4,4,...,,Commercial Zone,80,500,2nd quarter 2017,2017,2,Done,,
4,Residential Land(Land and Building),Residential Area,13107,Tokyo,Sumida Ward,Honjo,Honjoazumabashi,7,7,7,...,6.0,Neighborhood Commercial Zone,80,300,3rd quarter 2016,2016,3,,,


## Preparando dataset

- Selecionar colunas
- Ajustar tipos
- Imputar nulos

### Selecionando colunas

In [5]:
# Selecionando as mesmas colunas do df de treino

cols = ['MunicipalityCode', 'MaxTimeToNearestStation', 'Area', 'Frontage',
       'BuildingYear', 'Breadth', 'CoverageRatio', 'FloorAreaRatio', 'Year',
       'Quarter', 'Type', 'Region', 'NearestStation', 'LandShape', 'Structure',
       'Use', 'Direction', 'Classification', 'CityPlanning', 'TradePrice']

df_test = df_test[cols]
df_test.columns

Index(['MunicipalityCode', 'MaxTimeToNearestStation', 'Area', 'Frontage',
       'BuildingYear', 'Breadth', 'CoverageRatio', 'FloorAreaRatio', 'Year',
       'Quarter', 'Type', 'Region', 'NearestStation', 'LandShape', 'Structure',
       'Use', 'Direction', 'Classification', 'CityPlanning', 'TradePrice'],
      dtype='object')

### Ajustando tipos

In [19]:
# Substituir valores vazios por NaN
df_test.replace('', np.nan, inplace=True)

# Ajustar os tipos das colunas conforme df_train
df_test = df_test.astype({
    'MunicipalityCode': 'float64',
    'MaxTimeToNearestStation': 'float64',
    'Area': 'float64',
    'Frontage': 'float64',
    'BuildingYear': 'float64',
    'Breadth': 'float64',
    'CoverageRatio': 'float64',
    'FloorAreaRatio': 'float64',
    'Year': 'float64',
    'Quarter': 'float64',
    'Type': 'object',
    'Region': 'object',
    'NearestStation': 'object',
    'LandShape': 'object',
    'Structure': 'object',
    'Use': 'object',
    'Direction': 'object',
    'Classification': 'object',
    'CityPlanning': 'object',
    'TradePrice': 'float64'
})

# Verificar os tipos ajustados
print(df_test.dtypes)

MunicipalityCode           float64
MaxTimeToNearestStation    float64
Area                       float64
Frontage                   float64
BuildingYear               float64
Breadth                    float64
CoverageRatio              float64
FloorAreaRatio             float64
Year                       float64
Quarter                    float64
Type                        object
Region                      object
NearestStation              object
LandShape                   object
Structure                   object
Use                         object
Direction                   object
Classification              object
CityPlanning                object
TradePrice                 float64
dtype: object


  df_test.replace('', np.nan, inplace=True)


### Tratando nulos

In [23]:
# Função para calcular porcentagem de valores nulos por coluna
def calculate_null_percentages(df):
    # Calcular porcentagem de valores nulos
    null_percentages = df.isnull().mean() * 100
    
    # Criar DataFrame para exibir as porcentagens
    percentages_df = pd.DataFrame({
        'Null Percentage': null_percentages
    })
    
    # Ordenar o DataFrame em ordem decrescente
    percentages_df = percentages_df.sort_values(by='Null Percentage', ascending=False)

    return percentages_df

# Aplicar a função ao DataFrame df_test
percentages_df = calculate_null_percentages(df_test)

# Exibir o resultado
print(percentages_df)

                         Null Percentage
TradePrice                    100.000000
Frontage                       49.895467
Breadth                        46.336424
Classification                 45.983471
Direction                      45.388248
LandShape                      45.379639
Region                         45.276336
Use                            24.180338
BuildingYear                   22.789434
Structure                      21.777308
MaxTimeToNearestStation         2.592420
CoverageRatio                   1.552008
FloorAreaRatio                  1.552008
CityPlanning                    0.991219
NearestStation                  0.468554
MunicipalityCode                0.000000
Quarter                         0.000000
Year                            0.000000
Area                            0.000000
Type                            0.000000


Substituindo valores nulos com o método MICE:

In [27]:
df_test.dtypes

MunicipalityCode           float64
MaxTimeToNearestStation    float64
Area                       float64
Frontage                   float64
BuildingYear               float64
Breadth                    float64
CoverageRatio              float64
FloorAreaRatio             float64
Year                       float64
Quarter                    float64
Type                        object
Region                      object
NearestStation              object
LandShape                   object
Structure                   object
Use                         object
Direction                   object
Classification              object
CityPlanning                object
TradePrice                 float64
dtype: object

In [28]:
from fancyimpute import IterativeImputer
from sklearn.impute import SimpleImputer

# Excluir a coluna 'TradePrice' do DataFrame porque ela não será imputada
df1 = df_test.drop(columns=['TradePrice'])

# Identificar colunas numéricas e categóricas
numeric_columns = df1.select_dtypes(include=['number']).columns.tolist()
categorical_columns = df1.select_dtypes(include=['object', 'category']).columns.tolist()

# Separar colunas numéricas e categóricas
df_numeric = df1[numeric_columns]
df_categorical = df1[categorical_columns]

# Imputar valores numéricos usando MICE
mice_imputer = IterativeImputer()
df_numeric_imputed = pd.DataFrame(mice_imputer.fit_transform(df_numeric), columns=numeric_columns)

# Imputar valores categóricos usando imputação pela moda (valor mais frequente)
mode_imputer = SimpleImputer(strategy='most_frequent')
df_categorical_imputed = pd.DataFrame(mode_imputer.fit_transform(df_categorical), columns=categorical_columns)

# Substituir as colunas originais no DataFrame com as colunas imputadas
df1[numeric_columns] = df_numeric_imputed
df1[categorical_columns] = df_categorical_imputed

# Verificar se a imputação foi bem-sucedida
print(df1.isnull().sum())

MunicipalityCode           0
MaxTimeToNearestStation    0
Area                       0
Frontage                   0
BuildingYear               0
Breadth                    0
CoverageRatio              0
FloorAreaRatio             0
Year                       0
Quarter                    0
Type                       0
Region                     0
NearestStation             0
LandShape                  0
Structure                  0
Use                        0
Direction                  0
Classification             0
CityPlanning               0
dtype: int64


## Carregando modelo

In [6]:
# Carregar o pipeline completo de um arquivo pickle
with open('models/pipeline_xgb.pkl', 'rb') as file:
    loaded_pipeline_xgb = pickle.load(file)


df_test não tem valores na coluna TradePrice, por isso não faz sentido transformá-la em log aqui.

**MAS É MUITO IMPORTANTE COMPARAR AS PREVISÕES DO MODELO, DADAS EM LOG, COM OS VALORES REAIS TRANSFORMADOS EM LOG**

In [None]:
# Aplicar transformação log na coluna TradePrice e excluir a coluna original
# df_test['TradePrice_log'] = np.log1p(df_test['TradePrice'])
# df_test = df_test.drop(columns=['TradePrice'])

## Fazendo previsões

In [29]:
# Fazer previsões usando o pipeline carregado
predictions = loaded_pipeline_xgb.predict(df_test)

# Exibir as primeiras previsões para verificar
print(predictions[:10])

[17.160608 16.31718  17.493427 16.71166  17.301353 17.560663 18.114243
 19.136343 17.489992 17.10629 ]


## Gravando previsões no dataset de teste

In [31]:
# Adicionar previsões ao DataFrame
df_test['TradePrices_log'] = predictions

# Verificar se a coluna foi adicionada corretamente
print(df_test[['TradePrices_log']].head())

   TradePrices_log
0        17.160608
1        16.317181
2        17.493427
3        16.711660
4        17.301353


In [32]:
# salvar dataset de teste
df_test.to_csv('data/df_test_final.csv', index=False)