In [32]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

data = pd.read_csv("data/melb_data.csv")
y = data.Price

melb_predictors = data.drop(['Price'],axis=1,inplace=True)
X = data.select_dtypes(exclude=['object'])

X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size= 0.8, test_size= 0.2,random_state=0)


In [33]:
from sklearn.metrics import mean_absolute_error

def get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y):
    model = RandomForestRegressor()
    model.fit(train_X, train_y)
    preds_val = model.predict(val_X)
    mae = mean_absolute_error(val_y, preds_val)
    return(mae)

In [34]:
#drop colums
cols_with_missing = [col for col in X_train.columns
                    if X_train[col].isnull().any()]


reduced_X_train = X_train.drop(cols_with_missing,axis=1)
reduced_X_valid = X_valid.drop(cols_with_missing,axis=1)
get_mae(10,reduced_X_train,reduced_X_valid,
y_train,y_valid)

175511.9868221825

In [35]:
# imputation

from sklearn.impute import SimpleImputer

my_imputer = SimpleImputer()
imputed_X_train = pd.DataFrame(my_imputer.fit_transform(X_train))
imputed_X_valid = pd.DataFrame(my_imputer.transform(X_valid))

imputed_X_train.columns = X_train.columns
imputed_X_valid.columns = X_valid.columns

print("MAE from imputation")
print(get_mae(10,imputed_X_train,imputed_X_valid,
y_train,y_valid))


MAE from imputation
167761.3676652173


In [36]:
# imputation | extention with missing value column (yes,no)

X_train_plus = X_train.copy()
X_valid_plus = X_valid.copy()

for col in cols_with_missing:
    X_train_plus[col + "_was_missing"] = X_train_plus[col].isnull()
    X_valid_plus[col + "_was_missing"] = X_valid_plus[col].isnull()

my_imputer = SimpleImputer()
imputed_X_train_plus = pd.DataFrame(my_imputer.fit_transform(X_train_plus))
imputed_X_valid_plus = pd.DataFrame(my_imputer.transform(X_valid_plus))

imputed_X_train_plus.columns = X_train_plus.columns
imputed_X_valid_plus.columns = X_valid_plus.columns

print("MAE from imputation extended")
print(get_mae(10,imputed_X_train_plus,imputed_X_valid_plus,
y_train,y_valid))

MAE from imputation extended
167141.942883968
