In [1]:
import pandas as pd
import numpy as np

columns_to_use = ['Type', 'Method', 'Regionname', 'Rooms', 'Distance', 
                  'Postcode', 'Bedroom2', 'Bathroom', 'Landsize', 'Lattitude',
                 'Longtitude', 'Propertycount','Price']

df_melbourne = pd.read_csv("../../Kaggle/Melbourne-House-Snapshot/melb_data.csv", usecols=columns_to_use)

In [None]:
df_melbourne.head(n=10)

In [None]:
df_melbourne.info()

In [2]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# Function for comparing different approaches
def score_dataset(X_train, X_valid, y_train, y_valid):
    model = RandomForestRegressor(n_estimators=100, random_state=0)
    model.fit(X_train, y_train)
    preds = model.predict(X_valid)
    return mean_absolute_error(y_valid, preds) #Quanto Menor, Melhor!!!

  from numpy.core.umath_tests import inner1d


### Separa Treino e Teste

In [3]:
from sklearn.model_selection import train_test_split
y = df_melbourne.Price
X = df_melbourne.drop(['Price'], axis=1)

X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2,
                                                      random_state=0)

### Recuperando Lista de Variáveis Numéricos

In [4]:
lista = (df_melbourne.dtypes != 'object')
numerical_cols = list(lista[lista].index)

print("Variáveis Numéricos:")
print(numerical_cols)

Variáveis Numéricos:
['Rooms', 'Price', 'Distance', 'Postcode', 'Bedroom2', 'Bathroom', 'Landsize', 'Lattitude', 'Longtitude', 'Propertycount']


### Recupera colunas categóricas e transforma

In [5]:
#Trata atributos categóricos
from sklearn.preprocessing import LabelEncoder
s = (df_melbourne.dtypes == 'object')
object_cols = list(s[s].index)

print("Variáveis Categóricas:")
print(object_cols)

Variáveis Categóricas:
['Type', 'Method', 'Regionname']


In [6]:
pd.set_option('mode.chained_assignment',None)
label_encoder = LabelEncoder()

for col in object_cols:
    label_encoder.fit(X_train[col])
    X_train[col] = label_encoder.transform(X_train[col])    
    X_valid[col] = label_encoder.transform(X_valid[col])

### (1) Normalização

In [7]:
from sklearn.preprocessing import Normalizer

norm_X_train = X_train.copy()
norm_X_valid = X_valid.copy()

norm_numerical_cols = numerical_cols.copy()
del norm_numerical_cols[1]

for column in norm_numerical_cols:
    transformer = Normalizer()
    
    values = np.array(norm_X_train[column]).reshape(-1,1)
    norm_X_train[column] = transformer.fit_transform(values)
    
    values = np.array(norm_X_valid[column]).reshape(-1,1)
    norm_X_valid[column] = transformer.transform(values)

print("MAE from Approach 1 (Normalização):")
print(score_dataset(norm_X_train, norm_X_valid, y_train, y_valid))

MAE from Approach 1 (Normalização):
323172.6316158428


### (2) Padronização

In [8]:
from sklearn.preprocessing import StandardScaler

std_X_train = X_train.copy()
std_X_valid = X_valid.copy()

std_numerical_cols = numerical_cols.copy()
del std_numerical_cols[1]

for column in std_numerical_cols:
    transformer = StandardScaler()
    
    values = np.array(std_X_train[column]).reshape(-1,1)
    std_X_train[column] = transformer.fit_transform(values)
    
    values = np.array(std_X_valid[column]).reshape(-1,1)
    std_X_valid[column] = transformer.transform(values)

print("MAE from Approach 2 (Padronização):")
print(score_dataset(std_X_train, std_X_valid, y_train, y_valid))



MAE from Approach 2 (Padronização):
165999.9143014938
