# Basic problem setup

In [6]:
import pandas as pd

# Load data
melb_data = pd.read_csv("../input/melb_data/melb_data.csv")

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

melb_target = melb_data.Price
melb_predictors = melb_data.drop(['Price'], axis=1)

# For the sake of keeping the example simple, we'll only use the numeric predictors
melb_numeric_predictors = melb_predictors.select_dtypes(exclude=['object'])

In [7]:
melb_numeric_predictors.shape

(18396, 13)

In [33]:
melb_numeric_predictors.sample(6)

Unnamed: 0.1,Unnamed: 0,Rooms,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,Lattitude,Longtitude,Propertycount
4106,5342,2,4.6,3122.0,2.0,1.0,1.0,,,,-37.826,145.0349,11308.0
14629,18763,2,4.3,3032.0,2.0,1.0,1.0,16000.0,68.5,2012.0,-37.78466,144.93829,1052.0
15160,19441,3,19.6,3076.0,3.0,2.0,2.0,321.0,130.0,2011.0,-37.63169,145.03135,10926.0
5478,7138,3,11.2,3042.0,3.0,2.0,2.0,274.0,,,-37.7353,144.8907,2291.0
988,1280,3,13.0,3204.0,3.0,1.0,2.0,382.0,,,-37.9187,145.0461,6795.0
13189,16975,3,11.2,3046.0,,,,,,,,,8870.0


In [22]:
melb_numeric_predictors.isnull().sum()

Unnamed: 0           0
Rooms                0
Distance             1
Postcode             1
Bedroom2          3469
Bathroom          3471
Car               3576
Landsize          4793
BuildingArea     10634
YearBuilt         9438
Lattitude         3332
Longtitude        3332
Propertycount        1
dtype: int64

In [35]:
melb_numeric_predictors.isnull().any()

Unnamed: 0       False
Rooms            False
Distance          True
Postcode          True
Bedroom2          True
Bathroom          True
Car               True
Landsize          True
BuildingArea      True
YearBuilt         True
Lattitude         True
Longtitude        True
Propertycount     True
dtype: bool

# Create function to Measure Quality of an Approach

In [9]:
X_train, X_test, y_train, y_test = train_test_split(melb_numeric_predictors,
                                                    melb_target, train_size = 0.7, test_size=0.3, random_state=0)

def score_dataset(X_train, X_test, y_train, y_test):
    model = RandomForestRegressor()
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    return mean_absolute_error(y_test, preds)

# Get Model Score from Dropping Columns with Missing values

In [37]:
cols_with_missing_values = [col for col in X_train.columns if X_train[col].isnull().any()]

reduced_X_train = X_train.drop(cols_with_missing_values, axis=1)
reduced_X_test = X_test.drop(cols_with_missing_values, axis=1)
print "Mean absolute error from dropping columns with Missing values: "
print score_dataset(reduced_X_train, reduced_X_test, y_train, y_test)

Mean absolute error from dropping columns with Missing values: 
347681.370538141


# Get Model Score from imputation

In [39]:
from sklearn.preprocessing import Imputer

my_imputer = Imputer()
imputed_X_train = my_imputer.fit_transform(X_train)
imputed_X_test = my_imputer.transform(X_test)
print("Mean absolute error from Imputation: ")
print score_dataset(imputed_X_train, imputed_X_test, y_train, y_test)

Mean absolute error from Imputation: 
200830.7151114332


# Get Score from imputation with extra column showing what was Imputed

In [41]:
imputed_X_train_plus = X_train.copy()
imputed_X_test_plus = X_test.copy()

cols_with_missing_values = [col for col in X_train if X_train[col].isnull().any()]

for col in cols_with_missing_values:
    imputed_X_train_plus[col + '_was_missing'] = imputed_X_train_plus[col].isnull()
    imputed_X_test_plus[col + '_was_missing'] = imputed_X_test_plus[col].isnull()
    
my_imputer = Imputer()
imputed_X_train_plus = my_imputer.fit_transform(imputed_X_train_plus)
imputed_X_test_plus = my_imputer.transform(imputed_X_test_plus)
print "Mean absolute error from Imputation while Track what was Imputed: "
print score_dataset(imputed_X_train_plus, imputed_X_test_plus, y_train, y_test)

Mean absolute error from Imputation while Track what was Imputed: 
200779.04587787643
