## Predicting Iowa Housing Prices Using the Random Forests Algorithm

In [2]:
import pprint

import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor


# Relative path to the Iowa housing training CSV
iowa_file_path = 'home-data/train.csv'
# Read the CSV into a DataFrame that the rest of the notebook works from
home_data = pd.read_csv(filepath_or_buffer=iowa_file_path)

### Let's create a target object called `y`, and a feature matrix `X` that contains the features we want to consider when building our model.

In [3]:
# Prediction target: the SalePrice column of the loaded data
y = home_data['SalePrice']

# Columns used as model inputs
features = [
    'LotArea',
    'YearRemodAdd',
    'YearBuilt',
    'YrSold',
    'GarageCars',
    'KitchenAbvGr',
    '1stFlrSF',
    '2ndFlrSF',
    'FullBath',
    'BedroomAbvGr',
    'TotRmsAbvGrd',
]
# Feature matrix restricted to the columns listed above
X = home_data[features]

### Let's split our dataset into validation and training data using train_test_split. Afterwards, we can build our model by specifying which ML algorithm we want to employ. In this case, we will use RandomForestRegressor to build our model based on the Random Forest learning method.

### After building, we will feed our training dataset to the model.

In [13]:
# Split into validation and training data; the fixed random_state keeps the
# split identical across runs
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)


# Random forest model, seeded so repeated runs build the same forest
forest_model = RandomForestRegressor(random_state=1)
# Fit on the training split only. Prediction and the validation MAE are
# computed in the following cell, so they are not duplicated here.
forest_model.fit(train_X, train_y)


DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=1, splitter='best')

### Now we can validate our model by generating predictions for the validation data and computing the mean absolute error (`mean_absolute_error`).

In [14]:
# Predict sale prices for the held-out validation rows
iowa_predict = forest_model.predict(X=val_X)
# Compare the predictions with the true validation prices
rf_val_mae = mean_absolute_error(y_true=val_y, y_pred=iowa_predict)
print(
    "Validation MAE for Random Forest Model: {}".format(rf_val_mae)
)


 
Validation MAE (in dollars): 29,653


### We can also tune a single decision tree by finding its ideal max_leaf_nodes. But first, let's create a function that returns the Mean Absolute Error for a given tree size, so we don't have to repeat the same steps for every candidate.

In [15]:
def get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y, random_state=0):
    """Return the validation mean absolute error of a size-limited decision tree.

    A DecisionTreeRegressor capped at ``max_leaf_nodes`` leaves is fit on the
    training split and then scored on the validation split.

    Parameters
    ----------
    max_leaf_nodes
        Forwarded to ``DecisionTreeRegressor(max_leaf_nodes=...)``.
    train_X, train_y
        Features and targets the tree is fit on.
    val_X, val_y
        Features and targets the fitted tree is evaluated on.
    random_state
        Seed forwarded to the tree. Defaults to 0, the value this function
        previously hard-coded, so existing calls behave the same.

    Returns
    -------
    The result of ``mean_absolute_error(val_y, predictions)``.
    """
    model = DecisionTreeRegressor(
        max_leaf_nodes=max_leaf_nodes,
        random_state=random_state,
    )
    model.fit(train_X, train_y)
    predicted_val = model.predict(val_X)
    return mean_absolute_error(val_y, predicted_val)

In [16]:
# Tree sizes to compare
candidate_max_leaf_nodes = [5, 25, 50, 100, 250, 500]

# Map each candidate size to the validation MAE of a tree of that size
scores = dict(
    (size, get_mae(size, train_X, val_X, train_y, val_y))
    for size in candidate_max_leaf_nodes
)
# The best size is the key whose MAE is smallest
best_tree_size = min(scores, key=lambda size: scores[size])

pprint.pprint(scores)
print('\n The optimal number of max_leaf_nodes is: ', best_tree_size)

{5: 35044.51299744237,
 25: 29016.41319191076,
 50: 27405.930473214907,
 100: 27282.50803885739,
 250: 27893.822225701646,
 500: 29454.18598068598}

 The optimal number of max_leaf_nodes is:  100


### Now that we've determined the optimal number of max_leaf_nodes, we can build the final decision tree with that setting and fit it on all of the data.

In [17]:
# Build the final tree using the leaf-node limit selected above
final_model = DecisionTreeRegressor(
    random_state=1,
    max_leaf_nodes=best_tree_size,
)

# Fit on the full feature matrix and target rather than the training split
final_model.fit(X=X, y=y)

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=100, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=1, splitter='best')