In [1]:
import pandas as pd
import numpy as np

columns_to_use = ['Type', 'Method', 'Regionname', 'Rooms', 'Distance', 
                  'Postcode', 'Bedroom2', 'Bathroom', 'Landsize', 'Lattitude',
                 'Longtitude', 'Propertycount','Price']

df_melbourne = pd.read_csv("../../Kaggle/Melbourne-House-Snapshot/melb_data.csv", usecols=columns_to_use)

In [2]:
df_melbourne.head(n=10)

Unnamed: 0,Rooms,Type,Price,Method,Distance,Postcode,Bedroom2,Bathroom,Landsize,Lattitude,Longtitude,Regionname,Propertycount
0,2,h,1480000.0,S,2.5,3067.0,2.0,1.0,202.0,-37.7996,144.9984,Northern Metropolitan,4019.0
1,2,h,1035000.0,S,2.5,3067.0,2.0,1.0,156.0,-37.8079,144.9934,Northern Metropolitan,4019.0
2,3,h,1465000.0,SP,2.5,3067.0,3.0,2.0,134.0,-37.8093,144.9944,Northern Metropolitan,4019.0
3,3,h,850000.0,PI,2.5,3067.0,3.0,2.0,94.0,-37.7969,144.9969,Northern Metropolitan,4019.0
4,4,h,1600000.0,VB,2.5,3067.0,3.0,1.0,120.0,-37.8072,144.9941,Northern Metropolitan,4019.0
5,2,h,941000.0,S,2.5,3067.0,2.0,1.0,181.0,-37.8041,144.9953,Northern Metropolitan,4019.0
6,3,h,1876000.0,S,2.5,3067.0,4.0,2.0,245.0,-37.8024,144.9993,Northern Metropolitan,4019.0
7,2,h,1636000.0,S,2.5,3067.0,2.0,1.0,256.0,-37.806,144.9954,Northern Metropolitan,4019.0
8,1,u,300000.0,S,2.5,3067.0,1.0,1.0,0.0,-37.8008,144.9973,Northern Metropolitan,4019.0
9,2,h,1097000.0,S,2.5,3067.0,3.0,1.0,220.0,-37.801,144.9989,Northern Metropolitan,4019.0


In [3]:
df_melbourne.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13580 entries, 0 to 13579
Data columns (total 13 columns):
Rooms            13580 non-null int64
Type             13580 non-null object
Price            13580 non-null float64
Method           13580 non-null object
Distance         13580 non-null float64
Postcode         13580 non-null float64
Bedroom2         13580 non-null float64
Bathroom         13580 non-null float64
Landsize         13580 non-null float64
Lattitude        13580 non-null float64
Longtitude       13580 non-null float64
Regionname       13580 non-null object
Propertycount    13580 non-null float64
dtypes: float64(9), int64(1), object(3)
memory usage: 1.3+ MB


In [4]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# Function for comparing different approaches
def score_dataset(X_train, X_valid, y_train, y_valid):
    model = RandomForestRegressor(n_estimators=100, random_state=0)
    model.fit(X_train, y_train)
    preds = model.predict(X_valid)
    return mean_absolute_error(y_valid, preds) #Quanto Menor, Melhor!!!

  from numpy.core.umath_tests import inner1d


### Separa Treino e Teste

In [5]:
from sklearn.model_selection import train_test_split
y = df_melbourne.Price
X = df_melbourne.drop(['Price'], axis=1)

X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2,
                                                      random_state=0)

### Recuperando Lista de Variáveis Numéricos

In [6]:
lista = (df_melbourne.dtypes != 'object')
numerical_cols = list(lista[lista].index)

print("Variáveis Numéricos:")
print(numerical_cols)

Variáveis Numéricos:
['Rooms', 'Price', 'Distance', 'Postcode', 'Bedroom2', 'Bathroom', 'Landsize', 'Lattitude', 'Longtitude', 'Propertycount']


### Recupera colunas categóricas e transforma

In [7]:
#Trata atributos categóricos
from sklearn.preprocessing import LabelEncoder
s = (df_melbourne.dtypes == 'object')
object_cols = list(s[s].index)

print("Variáveis Categóricas:")
print(object_cols)

Variáveis Categóricas:
['Type', 'Method', 'Regionname']


In [8]:
pd.set_option('mode.chained_assignment',None)
label_encoder = LabelEncoder()

for col in object_cols:
    label_encoder.fit(X_train[col])
    X_train[col] = label_encoder.transform(X_train[col])    
    X_valid[col] = label_encoder.transform(X_valid[col])

### (1) Normalização

In [9]:
from sklearn.preprocessing import Normalizer

norm_X_train = X_train.copy()
norm_X_valid = X_valid.copy()

norm_numerical_cols = numerical_cols.copy()
del norm_numerical_cols[1]

for column in norm_numerical_cols:
    transformer = Normalizer()
    
    values = np.array(norm_X_train[column]).reshape(-1,1)
    norm_X_train[column] = transformer.fit_transform(values)
    
    values = np.array(norm_X_valid[column]).reshape(-1,1)
    norm_X_valid[column] = transformer.transform(values)

print("MAE from Approach 1 (Normalização):")
print(score_dataset(norm_X_train, norm_X_valid, y_train, y_valid))

MAE from Approach 1 (Normalização):
323172.6316158428


### (2) Padronização

In [10]:
from sklearn.preprocessing import StandardScaler

std_X_train = X_train.copy()
std_X_valid = X_valid.copy()

std_numerical_cols = numerical_cols.copy()
del std_numerical_cols[1]

for column in std_numerical_cols:
    transformer = StandardScaler()
    
    values = np.array(std_X_train[column]).reshape(-1,1)
    std_X_train[column] = transformer.fit_transform(values)
    
    values = np.array(std_X_valid[column]).reshape(-1,1)
    std_X_valid[column] = transformer.transform(values)

print("MAE from Approach 2 (Padronização):")
print(score_dataset(std_X_train, std_X_valid, y_train, y_valid))



MAE from Approach 2 (Padronização):
165999.9143014938
