<a href="https://colab.research.google.com/github/jack-cao-623/python_learning/blob/main/Python_Decision_Tree%2C_Underfitting_and_Overfitting.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Python Decision Tree, Underfitting and Overfitting

## Set up

In [3]:
# get kaggle data

# first, mount google drive

# copy .json file
! cp /content/drive/MyDrive/kaggle.json ~/.kaggle/kaggle.json

# get data
! kaggle competitions download home-data-for-ml-course -f train.csv

Downloading train.csv to /content
100% 450k/450k [00:00<00:00, 1.53MB/s]
100% 450k/450k [00:00<00:00, 1.53MB/s]


In [4]:
# libraries needed
import numpy as np
import pandas as pd

from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

In [5]:
# get data into python
home_data = pd.read_csv('train.csv')
home_data.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [6]:
# descriptive stats about home_data
home_data.describe()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
count,1460.0,1460.0,1201.0,1460.0,1460.0,1460.0,1460.0,1460.0,1452.0,1460.0,...,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0
mean,730.5,56.89726,70.049958,10516.828082,6.099315,5.575342,1971.267808,1984.865753,103.685262,443.639726,...,94.244521,46.660274,21.95411,3.409589,15.060959,2.758904,43.489041,6.321918,2007.815753,180921.19589
std,421.610009,42.300571,24.284752,9981.264932,1.382997,1.112799,30.202904,20.645407,181.066207,456.098091,...,125.338794,66.256028,61.119149,29.317331,55.757415,40.177307,496.123024,2.703626,1.328095,79442.502883
min,1.0,20.0,21.0,1300.0,1.0,1.0,1872.0,1950.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2006.0,34900.0
25%,365.75,20.0,59.0,7553.5,5.0,5.0,1954.0,1967.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,2007.0,129975.0
50%,730.5,50.0,69.0,9478.5,6.0,5.0,1973.0,1994.0,0.0,383.5,...,0.0,25.0,0.0,0.0,0.0,0.0,0.0,6.0,2008.0,163000.0
75%,1095.25,70.0,80.0,11601.5,7.0,6.0,2000.0,2004.0,166.0,712.25,...,168.0,68.0,0.0,0.0,0.0,0.0,0.0,8.0,2009.0,214000.0
max,1460.0,190.0,313.0,215245.0,10.0,9.0,2010.0,2010.0,1600.0,5644.0,...,857.0,547.0,552.0,508.0,480.0,738.0,15500.0,12.0,2010.0,755000.0


In [8]:
# specify features, X, and prediction target, y
y = home_data['SalePrice']
X = home_data[['LotArea', 'YearBuilt', '1stFlrSF', '2ndFlrSF', 'FullBath', 'BedroomAbvGr', 'TotRmsAbvGrd']]

In [22]:
# divide home_data into training and test
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 1)

## Function to MAE for different tree depths

In [23]:
def get_mae(max_leaf_nodes, X_train, X_test, y_train, y_test):
  """
  returns mean absolute error (mae) given 
  maximum number of leaf nodes and training and test data
  """

  # specify model
  decision_tree_model = DecisionTreeRegressor(max_leaf_nodes = max_leaf_nodes, random_state = 1)
  
  # fit model
  decision_tree_model.fit(X_train, y_train)

  # get predicted values
  y_predicted = decision_tree_model.predict(X_test)

  # return MAE
  mae = mean_absolute_error(y_predicted, y_test)
  return mae


## For loop to iterate through various tree depths

In [24]:
get_mae(5, X_train, X_test, y_train, y_test)

35044.51299744237

In [31]:
# iterate through these tree depths
depths = [5, 50, 500, 5000]
mae_values = []

for depth in depths:
  mae_depth = get_mae(max_leaf_nodes = depth, X_train = X_train, X_test = X_test, y_train = y_train, y_test = y_test)
  mae_values.append(mae_depth)


In [32]:
# view different mae valus
print(mae_values)

[35044.51299744237, 27405.930473214907, 28380.917944156296, 29001.372602739724]


## Exercise from Kaggle

In [37]:
# set up so you match the exercises

# load data
home_data = pd.read_csv('train.csv')

# define features, X, and prediction target, y
X = home_data[['LotArea', 'YearBuilt', '1stFlrSF', '2ndFlrSF', 'FullBath', 'BedroomAbvGr', 'TotRmsAbvGrd']]
y = home_data[['SalePrice']]

# split X and y into training and test
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 1)

# specify model
iowa_model = DecisionTreeRegressor(random_state = 1)

# fit model
iowa_model.fit(X_train, y_train)

# predicted y
y_predicted = iowa_model.predict(X_test)

# MAE
print(mean_absolute_error(y_predicted, y_test))

29652.931506849316


In [38]:
# function to get MAE given training and test data and tree depth
def get_mae(max_leaf_nodes, X_train, X_test, y_train, y_test):
  """
  get MAE given training and test data and tree depth
  """
  # specify model
  model = DecisionTreeRegressor(max_leaf_nodes = max_leaf_nodes, random_state = 0)

  # fit model
  model.fit(X_train, y_train)

  # make predictions
  y_predicted = model.predict(X_test)

  # get and return MAE
  mae = mean_absolute_error(y_predicted, y_test)
  return mae

In [39]:
# iterate through these tree depths
tree_depths = [5, 25, 50, 100, 250, 500]

# put MAE corresponding to above tree_depths in this list
mae_list = []

# iterate through each tree depth to get corresponding MAE
for depth in tree_depths:
  mae = get_mae(max_leaf_nodes = depth, X_train = X_train, X_test = X_test, y_train = y_train, y_test = y_test)
  mae_list.append(mae)

In [40]:
print(mae_list)

[35044.51299744237, 29016.41319191076, 27405.930473214907, 27282.50803885739, 27893.822225701646, 29454.18598068598]


In [43]:
# view tree depth and corresponding MAE together
pd.DataFrame(tree_depths, mae_list)

Unnamed: 0,0
35044.512997,5
29016.413192,25
27405.930473,50
27282.508039,100
27893.822226,250
29454.185981,500


In [44]:
# todo above: make tree depth the index and pull index with smallest MAE

In [45]:
# specify final model
final_model = DecisionTreeRegressor(max_leaf_nodes = 100, random_state = 0)

# fit final model
final_model.fit(X, y)

DecisionTreeRegressor(max_leaf_nodes=100, random_state=0)