In [19]:
import pandas as pd

In [20]:
# Load the dataset from melb_data.csv; the path is relative to the
# notebook's working directory.
melbourne_data = pd.read_csv("melb_data.csv")

In [21]:
# Target: the column the model is trained to predict.
y = melbourne_data["Price"]
# Predictors: every column except the target.
x = melbourne_data.drop(columns="Price")

In [22]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split the same on every run.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=18)

In [23]:
# Cardinality is the number of unique values in a column. Keep the text
# columns with fewer than 10 distinct values, so one-hot encoding them
# stays small.
#
# The dtype test accepts both the legacy "object" dtype and pandas'
# dedicated string dtype: comparing `dtype == "object"` alone silently
# selects nothing when text columns are stored with the string dtype.
# The dtype check runs first so nunique() is only computed for text columns.
low_cardinality_columns = [
    column
    for column in x_train.columns
    if (
        pd.api.types.is_object_dtype(x_train[column])
        or pd.api.types.is_string_dtype(x_train[column])
    )
    and x_train[column].nunique() < 10
]

In [24]:
# Numerical features: columns whose dtype is float64 or int64.
numerical_columns = [
    name
    for name in x_train.columns
    if x_train[name].dtype in ('float64', 'int64')
]

In [25]:
# Keep only the selected columns (categorical first, then numerical) in
# both splits; .copy() gives each split its own independent frame.
data = [*low_cardinality_columns, *numerical_columns]
x_train = x_train.loc[:, data].copy()
x_test = x_test.loc[:, data].copy()

In [26]:
# Preview the first rows of the reduced training frame.
x_train.head()

Unnamed: 0,Type,Method,Regionname,Rooms,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,Lattitude,Longtitude,Propertycount
2454,u,SP,Western Metropolitan,2,8.0,3040.0,2.0,1.0,1.0,170.0,88.0,1994.0,-37.7475,144.8959,9264.0
13030,h,S,Eastern Victoria,3,31.6,3796.0,3.0,1.0,0.0,915.0,101.0,1970.0,-37.78495,145.3919,3532.0
5490,h,PI,Western Metropolitan,3,6.6,3011.0,3.0,3.0,2.0,227.0,211.0,2015.0,-37.8033,144.8897,2417.0
2796,h,PI,Southern Metropolitan,5,9.2,3146.0,5.0,3.0,1.0,696.0,248.0,1920.0,-37.8563,145.0709,10412.0
12271,h,S,Eastern Metropolitan,3,23.0,3136.0,3.0,2.0,1.0,1007.0,250.0,1980.0,-37.76653,145.29163,2985.0


It can be seen that the data contains both missing values and categorical columns, so let's use pipelines to process them effectively.

In [27]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

In [28]:
# Numerical columns: replace missing entries with a constant. No fill_value
# is passed, so SimpleImputer's default constant is used (documented as 0
# for numeric data).
numerical_processor = SimpleImputer(strategy='constant')

# Categorical columns: fill gaps with the most common value of each column,
# then one-hot encode into a dense array. handle_unknown='ignore' stops
# categories that were not seen during fit from raising at transform time.
categorical_processor = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(sparse_output=False, handle_unknown='ignore')),
])

# Route each group of columns to its own processor.
pre_processor = ColumnTransformer([
    ('num', numerical_processor, numerical_columns),
    ('cat', categorical_processor, low_cardinality_columns),
])

In [29]:
from sklearn.ensemble import RandomForestRegressor

# Random forest regressor with 100 trees; the fixed random_state makes
# repeated runs build the same forest.
RF_Model = RandomForestRegressor(n_estimators=100, random_state=18) # Created Random Forest Model

In [30]:
from sklearn.metrics import mean_absolute_error
# Chain preprocessing and the model into one estimator, so the same
# transformations are applied at fit time and at predict time.
pre_processing_pipeline = Pipeline([
    ('preprocessor', pre_processor),
    ('model', RF_Model),
])

# Fit on the training split (fit returns the pipeline itself), then predict
# for the held-out split and score those predictions.
prediction = pre_processing_pipeline.fit(x_train, y_train).predict(x_test)
MAE_score = mean_absolute_error(y_true=y_test, y_pred=prediction)

print(f"The Mean Absolute Error for the testing data is {MAE_score}")

The Mean Absolute Error for the testing data is 164637.00742688827
