In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the dataset; the path is relative to the notebook's working directory.
data = pd.read_csv("data/melb_data.csv")

# Separate the prediction target from the predictors.
y = data["Price"]
X = data.drop(columns=["Price"])

# Hold out 20% of the rows for validation; the fixed seed keeps the split reproducible.
X_train_full, X_valid_full, y_train, y_valid = train_test_split(
    X, y, train_size=0.8, test_size=0.2, random_state=0
)

# Columns that contain at least one missing value in the training split.
# Only the training rows are inspected here.
cols_with_missing = [col for col in X_train_full.columns if X_train_full[col].isnull().any()]

# Drop by reassignment rather than inplace=True: the frames returned by
# train_test_split are derived from X, and mutating them in place can trigger
# pandas' copy warnings without any benefit.
X_train_full = X_train_full.drop(columns=cols_with_missing)
X_valid_full = X_valid_full.drop(columns=cols_with_missing)

# Cardinality: keep object-dtype columns with fewer than 10 distinct values
# in the training split.
low_cardinality_cols = [
    cname for cname in X_train_full.columns
    if X_train_full[cname].nunique() < 10 and X_train_full[cname].dtype == "object"
]

# Numeric predictors (int64 / float64 columns).
numerical_cols = [
    cname for cname in X_train_full.columns
    if X_train_full[cname].dtype in ['int64', 'float64']
]

# Final feature set: low-cardinality categoricals first, then the numeric columns.
my_cols = low_cardinality_cols + numerical_cols
X_train = X_train_full[my_cols].copy()
X_valid = X_valid_full[my_cols].copy()

X_train.head()

Unnamed: 0,Type,Method,Regionname,Rooms,Distance,Postcode,Bedroom2,Bathroom,Landsize,Lattitude,Longtitude,Propertycount
12167,u,S,Southern Metropolitan,1,5.0,3182.0,1.0,1.0,0.0,-37.85984,144.9867,13240.0
6524,h,SA,Western Metropolitan,2,8.0,3016.0,2.0,2.0,193.0,-37.858,144.9005,6380.0
8413,h,S,Western Metropolitan,3,12.6,3020.0,3.0,1.0,555.0,-37.7988,144.822,3755.0
2919,u,SP,Northern Metropolitan,3,13.0,3046.0,3.0,1.0,265.0,-37.7083,144.9158,8870.0
6043,h,S,Western Metropolitan,3,13.3,3020.0,3.0,1.0,673.0,-37.7623,144.8272,4217.0


In [9]:
# Names of all categorical (object-dtype) columns in X_train

# Boolean Series indexed by column name: True where the column dtype is object.
s = X_train.dtypes == 'object'
# Keep the column names whose flag is True, in their original order.
objec_cols = [name for name, is_object in s.items() if is_object]
print(objec_cols)



['Type', 'Method', 'Regionname']


In [16]:
# calculate MAE

from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import RandomForestRegressor

def score_dataset(train_X, val_X, train_y, val_y, n_estimators=100, random_state=0):
    """Fit a random forest on the training data and return its validation MAE.

    Parameters
    ----------
    train_X, train_y
        Features and target used to fit the model.
    val_X, val_y
        Features and target used to evaluate the fitted model.
    n_estimators : int, default 100
        Number of trees, passed through to RandomForestRegressor.
    random_state : int, default 0
        Seed passed through to RandomForestRegressor so repeated calls on the
        same data give the same score.

    Returns
    -------
    The mean absolute error between ``val_y`` and the model's predictions
    for ``val_X``.
    """
    model = RandomForestRegressor(n_estimators=n_estimators, random_state=random_state)
    model.fit(train_X, train_y)
    preds_val = model.predict(val_X)
    return mean_absolute_error(val_y, preds_val)

In [17]:
# Approach - 1 | Drop categorical variables

# Build both splits the same way: every object-dtype column is removed,
# leaving only the remaining (non-object) columns.
drop_X_train, drop_X_valid = (
    split.select_dtypes(exclude=['object']) for split in (X_train, X_valid)
)

print("MAE form Approach - 1 | Drop categorical variables")
print(score_dataset(drop_X_train, drop_X_valid, y_train, y_valid))


MAE form Approach - 1 | Drop categorical variables
175703.48185157913


In [18]:
# Approach - 2 | Label Encoding
from sklearn.preprocessing import LabelEncoder

# Encode into copies so X_train / X_valid keep their original string values.
label_X_train, label_X_valid = X_train.copy(), X_valid.copy()

# A single encoder object is reused for every column; it is re-fitted on each
# pass through the loop.
label_encoder = LabelEncoder()

for col in objec_cols:
    # Learn the label -> integer mapping from the training column only ...
    label_encoder.fit(X_train[col])
    # ... then apply that same mapping to both splits.
    label_X_train[col] = label_encoder.transform(X_train[col])
    label_X_valid[col] = label_encoder.transform(X_valid[col])

print("Approach - 2 | Label Encoding")
print(score_dataset(label_X_train, label_X_valid, y_train, y_valid))

Approach - 2 | Label Encoding
165936.40548390493


In [None]:
# Approach - 3 | One-Hot Encoding

from sklearn.preprocessing import OneHotEncoder

# scikit-learn 1.2 renamed the `sparse` keyword to `sparse_output` and 1.4
# removed the old name, so try the current spelling first and fall back to the
# legacy one on older installations. Either way the encoder returns a dense
# array, and categories unseen during fit are encoded as all zeros.
try:
    OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)

# Fit on the training categoricals only, then reuse the fitted encoder for the
# validation split. Building the frames with the source index keeps the rows
# aligned for the concat below (a bare DataFrame would get a fresh 0..n-1 index).
OH_cols_train = pd.DataFrame(OH_encoder.fit_transform(X_train[objec_cols]), index=X_train.index)
OH_cols_valid = pd.DataFrame(OH_encoder.transform(X_valid[objec_cols]), index=X_valid.index)

# The original categorical columns are replaced by their one-hot expansion.
num_X_train = X_train.drop(columns=objec_cols)
num_X_valid = X_valid.drop(columns=objec_cols)

OH_X_train = pd.concat([OH_cols_train, num_X_train], axis=1)
OH_X_valid = pd.concat([OH_cols_valid, num_X_valid], axis=1)

# The one-hot block has integer column labels while the numeric block has
# string labels; recent scikit-learn versions reject feature names of mixed
# types, so make every label a string before fitting.
OH_X_train.columns = OH_X_train.columns.astype(str)
OH_X_valid.columns = OH_X_valid.columns.astype(str)

print("Approach - 3 | One-Hot Encoding")
print(score_dataset(OH_X_train, OH_X_valid, y_train, y_valid))