# Predicting House Prices - Sample Exercise (from Kaggle)

In [52]:
import os

import pandas as pd

# train.csv contains the house data from Iowa used throughout this notebook.
# The author's original absolute path stays the default so existing runs keep working;
# set the IOWA_TRAIN_CSV environment variable to load the file from another location
# (another machine, another user profile, a CI runner, ...).
DEFAULT_IOWA_FILE_PATH = "C:\\Users\\buchh\\OneDrive\\Desktop\\Untitled Folder\\resource\\train.csv"
iowa_file_path = os.environ.get("IOWA_TRAIN_CSV", DEFAULT_IOWA_FILE_PATH)
home_data = pd.read_csv(iowa_file_path)

## Task: Predict housing price

### Basic Pandas function to review the data

In [None]:
#print(home_data.head())
#home_data.describe()
#home_data.columns

### Finding features

In [54]:
# Columns of home_data that are used as model inputs when predicting the price.
# NOTE(review): 'Id' looks like a row identifier rather than a property of the house;
# it is kept because a later cell reads val_X['Id'] to look up the actual sale prices.
feature_names = [
    'Id',
    'LotArea',
    'YearBuilt',
    '1stFlrSF',
    '2ndFlrSF',
    'FullBath',
    'BedroomAbvGr',
    'TotRmsAbvGrd',
]

# X: the selected feature columns, y: the value to predict (sale price)
X = home_data[feature_names]
y = home_data["SalePrice"]

### Fitting model

In [55]:
from sklearn.tree import DecisionTreeRegressor

In [56]:
# A decision tree is built heuristically (finding the optimal tree is an
# NP-complete problem), and the procedure involves randomness, so the model
# obtained today might differ from the one obtained on a later run.
# Setting random_state makes the result repeatable; the value itself is not
# important as long as it is set.

# NOTE: in this example the model is fitted and evaluated on the same data, which
# is not a good idea. The next section is about choosing training and testing data.
iowa_model = DecisionTreeRegressor(random_state=1).fit(X, y)

# Predict the price of every house the model was just fitted on
predictions = iowa_model.predict(X)
print(predictions)

[208500. 181500. 223500. ... 266500. 142125. 147500.]


### Avoiding underfitting and overfitting

In [57]:
# A tree of depth d has at most 2^d leaves (e.g. a tree with 2 layers has 2^2 = 4 leaves).
# The more leaves there are, the more complicated the model becomes (overfitting),
# and if there are too few, underfitting can happen. To avoid both,
# the number of leaves is chosen by trying several candidate values and
# picking the one with the lowest MAE (mean absolute error) on the validation data.

from sklearn.metrics import mean_absolute_error
from sklearn.tree import DecisionTreeRegressor

In [58]:
def get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y):
    """Return the validation MAE of a decision tree limited to `max_leaf_nodes` leaves.

    A DecisionTreeRegressor (random_state=0) is fitted on (train_X, train_y);
    its predictions for val_X are then compared with val_y using
    mean_absolute_error.
    """
    tree = DecisionTreeRegressor(max_leaf_nodes=max_leaf_nodes, random_state=0)
    tree.fit(train_X, train_y)
    return mean_absolute_error(val_y, tree.predict(val_X))

# Create the train/validation split in this cell so it also runs on a fresh kernel
# (Restart & Run All): the names train_X, val_X, train_y and val_y are otherwise only
# assigned by a cell further down. random_state=1 is the same seed the later
# model-fitting cell uses, so both cells work on identical rows.
from sklearn.model_selection import train_test_split

train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)

# Try several tree sizes and report the validation MAE for each of them
candidate_max_leaf_nodes = [5, 25, 50, 100, 250, 500]
for max_leaf_nodes in candidate_max_leaf_nodes:
    my_mae = get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y)
    print("Max leaf nodes: %d  \t Mean Absolute Error:  %d" %(max_leaf_nodes, my_mae))

Max leaf nodes: 5  	 Mean Absolute Error:  35044
Max leaf nodes: 25  	 Mean Absolute Error:  29016
Max leaf nodes: 50  	 Mean Absolute Error:  27405
Max leaf nodes: 100  	 Mean Absolute Error:  27282
Max leaf nodes: 250  	 Mean Absolute Error:  27893
Max leaf nodes: 500  	 Mean Absolute Error:  29454


In [59]:
# The best number of leaf nodes is the candidate with the lowest validation MAE.
# It is computed from the candidates instead of being copied by hand from the
# printed table, so it stays correct if the features or the split ever change
# (for the output shown above this evaluates to 100).
mae_by_leaf_nodes = {
    leaf_nodes: get_mae(leaf_nodes, train_X, val_X, train_y, val_y)
    for leaf_nodes in candidate_max_leaf_nodes
}
number_of_nodes = min(mae_by_leaf_nodes, key=mae_by_leaf_nodes.get)

In [60]:
# It is important that the training and validation data are kept separate.
# There is no single correct way to split them, which is why the
# train_test_split function is used. The split is based on a random number generator.
# Supplying a numeric value to the random_state argument guarantees
# we get the same split every time we run this script.
from sklearn.model_selection import train_test_split

In [111]:
# Hold out part of the rows for validation; the fixed seed keeps the split repeatable
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)

# Fit the size-limited tree on the training rows only ...
iowa_model = DecisionTreeRegressor(max_leaf_nodes=number_of_nodes, random_state=1).fit(train_X, train_y)

# ... and predict prices for the held-out validation rows
predictions = iowa_model.predict(val_X)

# Show actual vs. predicted price for the first five validation rows.
# val_y holds the actual prices in the same row order as val_X (both come from the
# same train_test_split call), so no per-row lookup by 'Id' in home_data is needed,
# and every value handed to %d is a plain scalar instead of a one-element Series
# (implicit Series -> int conversion is deprecated in recent pandas releases).
for actual_price, predicted_price in zip(val_y.iloc[:5], predictions):
    print("Actual: %d \t Predicted: %d" % (actual_price, predicted_price))

Actual: 231500 	 Predicted: 181225
Actual: 179500 	 Predicted: 173500
Actual: 122000 	 Predicted: 122142
Actual: 84500 	 Predicted: 77155
Actual: 142000 	 Predicted: 148515


### Validation

In [112]:
# error = actual - predicted
# In MAE, the absolute value of each error is taken, which turns every
# error into a positive number; the average of these values is then taken.
# It basically means, "On average, our predictions are off by X"
from sklearn.metrics import mean_absolute_error

In [113]:
# Mean absolute error of the decision tree's predictions on the validation rows
val_mae = mean_absolute_error(y_true=val_y, y_pred=predictions)
print(val_mae)

27967.997329237485


## Using the Random Forest algorithm

In [None]:
# The random forest uses many trees, and it makes a prediction by averaging 
# the predictions of each component tree. It generally has much better predictive 
# accuracy than a single decision tree and it works well with default parameters. 
# There are many more models with even better performance, 
# but many of those are sensitive to getting the right parameters.

In [115]:
from sklearn.ensemble import RandomForestRegressor

# Build the random forest (random_state=1 for repeatable results) and
# fit it on the training rows in one step
rf_model = RandomForestRegressor(random_state=1).fit(train_X, train_y)

# Predict prices for the validation rows
rf_val_pred = rf_model.predict(val_X)

# Mean absolute error of the random forest on the validation data
rf_val_mae = mean_absolute_error(val_y, rf_val_pred)

print("Validation MAE for Random Forest Model: {}".format(rf_val_mae))

Validation MAE for Random Forest Model: 22216.7021369863
