## Underfitting and Overfitting
Dataset: Home prices in Melbourne, Australia

In [1]:
import pandas as pd

In [2]:
melbourne_prices = 'Datasets/melb_data.csv'
price_data = pd.read_csv(melbourne_prices)

price_data.dropna(axis=0)

y = price_data.Price

melbourne_features = ['Rooms', 'Bathroom', 'Landsize', 'Lattitude', 'Longtitude']
X = price_data[melbourne_features]

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y , random_state=42)

In [3]:
from sklearn.metrics import mean_absolute_error
from sklearn.tree import DecisionTreeRegressor

def get_mae(max_leaf_nodes, max_depth, X_train, X_test, y_train, y_test):
    model = DecisionTreeRegressor(max_leaf_nodes=max_leaf_nodes, max_depth=max_depth, random_state=42)
    model.fit(X_train, y_train)
    pred = model.predict(X_test)
    mea = mean_absolute_error(y_test, pred)
    
    return mea

In [4]:
best = {}
for max_leaf_nodes in [5, 50, 500, 510, 515, 520]:
    for max_depth in [10, 15, 20, 30, None]:
        mae = get_mae(max_leaf_nodes, max_depth, X_train, X_test, y_train, y_test)
        print("Max leaf nodes: {0}  \t Max depth: {1}   \t Mean Absolute Error:  {2}".format(max_leaf_nodes, max_depth, mae))
        best[mae] = [max_leaf_nodes, max_depth]

Max leaf nodes: 5  	 Max depth: 10   	 Mean Absolute Error:  353119.2126692225
Max leaf nodes: 5  	 Max depth: 15   	 Mean Absolute Error:  353119.2126692225
Max leaf nodes: 5  	 Max depth: 20   	 Mean Absolute Error:  353119.2126692225
Max leaf nodes: 5  	 Max depth: 30   	 Mean Absolute Error:  353119.2126692225
Max leaf nodes: 5  	 Max depth: None   	 Mean Absolute Error:  353119.2126692225
Max leaf nodes: 50  	 Max depth: 10   	 Mean Absolute Error:  259147.3027976146
Max leaf nodes: 50  	 Max depth: 15   	 Mean Absolute Error:  255724.2842347583
Max leaf nodes: 50  	 Max depth: 20   	 Mean Absolute Error:  255724.2842347583
Max leaf nodes: 50  	 Max depth: 30   	 Mean Absolute Error:  255724.2842347583
Max leaf nodes: 50  	 Max depth: None   	 Mean Absolute Error:  255724.2842347583
Max leaf nodes: 500  	 Max depth: 10   	 Mean Absolute Error:  236595.5919407878
Max leaf nodes: 500  	 Max depth: 15   	 Mean Absolute Error:  221946.70481048163
Max leaf nodes: 500  	 Max depth: 20  

In [5]:
minval = min(best.keys())
res = [k for k, v in best.items() if k==minval]
print(res)
print("Best parameters: ")
print("Max leaf nodes: {0}  \t Max depth: {1}   \t Mean Absolute Error:  {2}".format(best[res[0]][0], best[res[0]][1], res))

[221565.52408127012]
Best parameters: 
Max leaf nodes: 515  	 Max depth: 15   	 Mean Absolute Error:  [221565.52408127012]
