## Categorical Variables
Dataset: Home prices in Melbourne, Australia

In [1]:
import pandas as pd
price_data = pd.read_csv('Datasets/melb_data.csv')

y = price_data.Price
X = price_data.drop('Price', axis=1)

from sklearn.model_selection import train_test_split
X_train_full, X_test_full, y_train, y_test = train_test_split(X, y , train_size=0.8, test_size=0.2)

In [2]:
cols_with_missing = [col for col in X_train_full.columns if X_train_full[col].isnull().any()] 
X_train_full.drop(cols_with_missing, axis=1, inplace=True)
X_test_full.drop(cols_with_missing, axis=1, inplace=True)

low_cardinality_cols = [cname for cname in X_train_full.columns if X_train_full[cname].nunique() < 10 and 
                        X_train_full[cname].dtype == "object"]

numerical_cols = [cname for cname in X_train_full.columns if X_train_full[cname].dtype in ['int64', 'float64']]

my_cols = low_cardinality_cols + numerical_cols
X_train = X_train_full[my_cols].copy()
X_test = X_test_full[my_cols].copy()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [3]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

def score_dataset(X_train, X_test, y_train, y_test):
    model = RandomForestRegressor(n_estimators=10, random_state=0)
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    return mean_absolute_error(y_test, preds)

In [4]:
X_train.head()

Unnamed: 0,Type,Method,Regionname,Rooms,Distance,Postcode,Bedroom2,Bathroom,Landsize,Lattitude,Longtitude,Propertycount
12808,h,SP,Northern Metropolitan,3,11.2,3046.0,3.0,1.0,327.0,-37.7151,144.92103,2651.0
5553,h,VB,Southern Metropolitan,2,2.1,3205.0,2.0,1.0,185.0,-37.8337,144.9507,5943.0
4256,t,S,Western Metropolitan,3,8.4,3015.0,3.0,2.0,224.0,-37.8424,144.8653,5498.0
1045,h,S,Southern Metropolitan,4,11.2,3186.0,4.0,3.0,742.0,-37.914,144.9878,10579.0
10888,h,SP,Western Metropolitan,4,15.5,3028.0,4.0,2.0,749.0,-37.88413,144.79033,7630.0


In [5]:
s = (X_train.dtypes == 'object')
object_cols = list(s[s].index)

print("Categorical variables: ")
print(object_cols)

Categorical variables: 
['Type', 'Method', 'Regionname']


In [6]:
drop_X_train = X_train.select_dtypes(exclude=['object'])
drop_X_test = X_test.select_dtypes(exclude=['object'])

print("MAE from Approach 1 (Drop categorical variables): ")
print(score_dataset(drop_X_train, drop_X_test, y_train, y_test))

MAE from Approach 1 (Drop categorical variables): 
195194.37202687777


In [7]:
from sklearn.preprocessing import LabelEncoder

label_X_train = X_train.copy()
label_X_test = X_test.copy()

label_encoder = LabelEncoder()
for col in object_cols:
    label_X_train[col] = label_encoder.fit_transform(X_train[col])
    label_X_test[col] = label_encoder.transform(X_test[col])

print("MAE from Approach 2 (Label Encoding): ") 
print(score_dataset(label_X_train, label_X_test, y_train, y_test))

MAE from Approach 2 (Label Encoding): 
180697.3003190967


In [8]:
from sklearn.preprocessing import OneHotEncoder

OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
OH_cols_train = pd.DataFrame(OH_encoder.fit_transform(X_train[object_cols]))
OH_cols_test = pd.DataFrame(OH_encoder.transform(X_test[object_cols]))

OH_cols_train.index = X_train.index
OH_cols_test.index = X_test.index

num_X_train = X_train.drop(object_cols, axis=1)
num_X_test = X_test.drop(object_cols, axis=1)

# Add one-hot encoded columns to numerical features
OH_X_train = pd.concat([num_X_train, OH_cols_train], axis=1)
OH_X_test = pd.concat([num_X_test, OH_cols_test], axis=1)

print("MAE from Approach 3 (One-Hot Encoding): ") 
print(score_dataset(OH_X_train, OH_X_test, y_train, y_test))

MAE from Approach 3 (One-Hot Encoding): 
180441.00158847042
