In [89]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor

In [90]:
# Column reference for the dataset loaded below:
#   CRIM    - crime per capita
#   ZN      - proportion of residential land zoned for lots over 25,000 sq.ft.
#   INDUS   - proportion of non-retail business acres per town
#   CHAS    - Charles River dummy variable (1 if the tract bounds the river, 0 otherwise)
#   NOX     - nitric oxides concentration (parts per 10 million)
#   RM      - average number of rooms per dwelling
#   AGE     - proportion of owner-occupied units built prior to 1940
#   DIS     - weighted distances to five Boston employment centers
#   RAD     - index of accessibility to radial highways
#   TAX     - full-value property-tax rate per $10,000
#   PTRATIO - pupil-teacher ratio by town
#   LSTAT   - percent lower status of the population
#   MEDV    - median value of owner-occupied homes in $1000s

# Read the CSV into a DataFrame and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='real_estate_data.csv')
df.head(n=5)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,LSTAT,MEDV
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1,296,15.3,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2,242,17.8,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2,242,17.8,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3,222,18.7,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3,222,18.7,,36.2


In [91]:
# (rows, columns) of the frame as loaded, before any rows are dropped.
df.shape

(506, 13)

In [92]:
# Count the missing entries in each column: isna() marks every null/NaN cell
# as True, and summing down the rows (axis=0) treats each True as 1, giving
# one total per column.
df.isna().sum(axis=0)

CRIM       20
ZN         20
INDUS      20
CHAS       20
NOX         0
RM          0
AGE        20
DIS         0
RAD         0
TAX         0
PTRATIO     0
LSTAT      20
MEDV        0
dtype: int64

In [93]:
# Data pre-processing
# Discard every row that contains at least one missing value. The cleaned
# frame is assigned back to `df` rather than mutating it with inplace=True
# (the reassignment form is the recommended pandas idiom and keeps the step
# chainable). Surviving rows keep their original index labels.
df = df.dropna()

In [94]:
# Re-count missing entries per column to confirm the rows with NaNs are gone.
df.isna().sum(axis=0)

CRIM       0
ZN         0
INDUS      0
CHAS       0
NOX        0
RM         0
AGE        0
DIS        0
RAD        0
TAX        0
PTRATIO    0
LSTAT      0
MEDV       0
dtype: int64

In [95]:
# Target vector: the MEDV column. Feature matrix: every other column.
y = df['MEDV']
X = df.drop(labels='MEDV', axis=1)

In [96]:
# Preview the first five feature rows (MEDV should no longer be present).
X.head(n=5)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1,296,15.3,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2,242,17.8,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2,242,17.8,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3,222,18.7,2.94
5,0.02985,0.0,2.18,0.0,0.458,6.43,58.7,6.0622,3,222,18.7,5.21


In [97]:
# Preview the first five target values.
y.head(n=5)

0    24.0
1    21.6
2    34.7
3    33.4
5    28.7
Name: MEDV, dtype: float64

In [98]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the
# shuffle, and therefore the split, repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=4,
)

In [99]:
# Create a DecisionTreeRegressor that picks splits by squared error.
# random_state is pinned (same seed value as the train/test split) because
# scikit-learn permutes the features at every split and breaks ties between
# equally good splits at random; without a fixed seed the fitted tree and its
# test metrics can differ from one run to the next.
regression_tree = DecisionTreeRegressor(criterion='squared_error', random_state=4)

In [100]:
# Fit the tree on the training portion of the data.
regression_tree.fit(X=X_train, y=y_train)

In [101]:
# Evaluate on the held-out set. For a regressor, score() returns R^2 (the
# coefficient of determination): a score where higher is better, not an error.
regression_tree.score(X=X_test, y=y_test)

0.6507654112710402

In [102]:
# Predict the target for every row of the test set.
pred = regression_tree.predict(X_test)

# Mean absolute error between truth and prediction. MEDV is expressed in
# $1000s, so the result is multiplied by 1000 to report it in dollars.
print(
    'The average error in our prediction is $',
    (y_test - pred).abs().mean() * 1000,
)

The average error in our prediction is $ 3945.5696202531662


In [103]:
# Train a second tree, this time choosing splits by absolute error.
# random_state is pinned (same seed value as the train/test split) because
# scikit-learn permutes the features at every split and breaks ties between
# equally good splits at random; without a fixed seed the fitted tree and its
# test metrics can differ from one run to the next.
regression_tree = DecisionTreeRegressor(criterion='absolute_error', random_state=4)
regression_tree.fit(X_train, y_train)

In [104]:
# R^2 of the absolute-error tree on the held-out set (higher is better).
regression_tree.score(X=X_test, y=y_test)

0.7058567627263429

In [105]:
# Predict the target for every row of the test set with the refitted tree.
pred = regression_tree.predict(X_test)

# r2_score(y_true, y_pred) is another way to obtain the R^2 that
# regression_tree.score() reports. R^2 is a unitless score (higher is better),
# so it is labelled as a score and printed without a dollar sign.
print('The r2 score is', r2_score(y_test, pred))

# Mean absolute error; MEDV is in $1000s, hence the factor of 1000 for dollars.
print('The average error is $', (pred - y_test).abs().mean() * 1000)

The r2 error is $ 0.7058567627263429
The average error is $ 3525.3164556962033
