<a href="https://colab.research.google.com/github/jack-cao-623/python_learning/blob/main/_Python_Decision_Trees%2C_Model_Validation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Python Decision Trees, Model Validation

## Set up

In [3]:
# get data from kaggle
  # first, mount google drive; click on file folder icon on left, then mount icon

! pip install kaggle                                                # install kaggle library; may not need this? try without next time
! mkdir ~/.kaggle                                                   # create directory named kaggle; may not need this? try without next time
! cp /content/drive/MyDrive/kaggle.json ~/.kaggle/kaggle.json       # copy .json file from mounted drive to current instance; probably need this, but try without

! kaggle competitions download home-data-for-ml-course -f train.csv # download train.csv from home-data-for-ml-course

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
mkdir: cannot create directory ‘/root/.kaggle’: File exists
train.csv: Skipping, found more recently modified local copy (use --force to force download)


In [16]:
# libraries needed
import numpy as np
import pandas as pd

from sklearn.tree import DecisionTreeRegressor       # to fit decision tree
from sklearn.metrics import mean_absolute_error      # to caluclate MAE
from sklearn.model_selection import train_test_split # to split in training and test

In [17]:
# load data
home_data = pd.read_csv('train.csv')

# view
home_data.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


## Fit decision tree on entire dataset

In [18]:
# set prediction target y
y = home_data['SalePrice']
print(y)

0       208500
1       181500
2       223500
3       140000
4       250000
         ...  
1455    175000
1456    210000
1457    266500
1458    142125
1459    147500
Name: SalePrice, Length: 1460, dtype: int64


In [19]:
# set features, X, to predict y
X = home_data[['LotArea', 'YearBuilt', '1stFlrSF', '2ndFlrSF', 'FullBath', 'BedroomAbvGr', 'TotRmsAbvGrd']]
print(X)

      LotArea  YearBuilt  1stFlrSF  2ndFlrSF  FullBath  BedroomAbvGr  \
0        8450       2003       856       854         2             3   
1        9600       1976      1262         0         2             3   
2       11250       2001       920       866         2             3   
3        9550       1915       961       756         1             3   
4       14260       2000      1145      1053         2             4   
...       ...        ...       ...       ...       ...           ...   
1455     7917       1999       953       694         2             3   
1456    13175       1978      2073         0         2             3   
1457     9042       1941      1188      1152         2             4   
1458     9717       1950      1078         0         1             2   
1459     9937       1965      1256         0         1             3   

      TotRmsAbvGrd  
0                8  
1                6  
2                6  
3                7  
4                9  
...      

In [20]:
# fit decision tree model
iowa_model = DecisionTreeRegressor()
iowa_model.fit(X, y)

DecisionTreeRegressor()

In [21]:
# view predicted sale prices and compare to actual for some of the data
print(iowa_model.predict(X.head()))  # predicted
print(y.head().tolist())             # actual

[208500. 181500. 223500. 140000. 250000.]
[208500, 181500, 223500, 140000, 250000]


In [22]:
# calculate mean absolute error
mean_absolute_error(iowa_model.predict(X), y)

62.35433789954339

## Fit decision on training, validate on test

In [23]:
# split into training and test
X_train, X_test, y_train, y_test = train_test_split(X, y , random_state = 1)  # defaults to 75% train, 25% test

In [27]:
# understand how it was split
  # for X
print(X.shape)
print(X_train.shape)
print(X_test.shape)

  # for y
print(y.shape)
print(y_train.shape)
print(y_test.shape)

(1460, 7)
(1095, 7)
(365, 7)
(1460,)
(1095,)
(365,)


In [30]:
# fit model on training set
iowa_model_train = DecisionTreeRegressor(random_state = 1)
iowa_model_train.fit(X_train, y_train)

DecisionTreeRegressor(random_state=1)

In [32]:
# predicted values
predicted_y = iowa_model_train.predict(X_test)
print(predicted_y)

[186500. 184000. 130000.  92000. 164500. 220000. 335000. 144152. 215000.
 262000. 180000. 121000. 175900. 210000. 248900. 131000. 100000. 149350.
 235000. 156000. 149900. 265979. 193500. 377500. 100000. 162900. 145000.
 180000. 582933. 146000. 140000.  91500. 112500. 113000. 145000. 312500.
 110000. 132000. 305000. 128000. 162900. 115000. 110000. 124000. 215200.
 180000.  79000. 192000. 282922. 235000. 132000. 325000.  80000. 237000.
 208300. 100000. 120500. 162000. 153000. 187000. 185750. 335000. 129000.
 124900. 185750. 133700. 127000. 230000. 146800. 157900. 136000. 153575.
 335000. 177500. 143000. 202500. 168500. 105000. 305900. 192000. 190000.
 140200. 134900. 128950. 213000. 108959. 149500. 190000. 175900. 160000.
 250580. 157000. 120500. 147500. 118000. 117000. 110000. 130000. 148500.
 148000. 190000. 130500. 127000. 120500. 135000. 168000. 176432. 128000.
 147000. 260000. 132000. 129500. 171000. 181134. 227875. 189000. 282922.
  94750. 185000. 194000. 159000. 279500. 290000. 13

In [33]:
# calcluate MAE
mean_absolute_error(predicted_y, y_test)

29652.931506849316