In [56]:
from sklearn.datasets import load_boston
import numpy as np
import pandas as pd

### Find some details about the data

In [57]:
# Load the Boston housing dataset bundled with scikit-learn and print its
# built-in description.
# NOTE: load_boston was deprecated in scikit-learn 1.0 and removed in 1.2, so
# this cell needs an older scikit-learn release.
boston_data = load_boston()
print(boston_data.DESCR)

.. _boston_dataset:

Boston house prices dataset
---------------------------

**Data Set Characteristics:**  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive. Median Value (attribute 14) is usually the target.

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pu

In [58]:
# Wrap the feature matrix in a DataFrame, labelling the columns with the
# dataset's own feature names.
boston_dataframe = pd.DataFrame(
    boston_data.data,
    columns=boston_data.feature_names,
)

# Peek at the first three rows.
boston_dataframe.head(n=3)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03


**DataFrame structure (number of instances and features)**

In [59]:
# (rows, columns) of the feature frame; as the cell's last expression it is
# displayed without needing print().
boston_dataframe.shape

(506, 13)


**Find the index of the dataset**

In [60]:
# Show the row index of the frame (the row labels, not the data).
boston_dataframe.index

RangeIndex(start=0, stop=506, step=1)

**Find the statistical summary of the data**

In [61]:
# Per-column summary statistics: count, mean, std, min, quartiles and max.
# The percentiles are spelled out; they are the same ones describe() uses by default.
print(boston_dataframe.describe(percentiles=[0.25, 0.5, 0.75]))

             CRIM          ZN       INDUS        CHAS         NOX          RM  \
count  506.000000  506.000000  506.000000  506.000000  506.000000  506.000000   
mean     3.613524   11.363636   11.136779    0.069170    0.554695    6.284634   
std      8.601545   23.322453    6.860353    0.253994    0.115878    0.702617   
min      0.006320    0.000000    0.460000    0.000000    0.385000    3.561000   
25%      0.082045    0.000000    5.190000    0.000000    0.449000    5.885500   
50%      0.256510    0.000000    9.690000    0.000000    0.538000    6.208500   
75%      3.677083   12.500000   18.100000    0.000000    0.624000    6.623500   
max     88.976200  100.000000   27.740000    1.000000    0.871000    8.780000   

              AGE         DIS         RAD         TAX     PTRATIO           B  \
count  506.000000  506.000000  506.000000  506.000000  506.000000  506.000000   
mean    68.574901    3.795043    9.549407  408.237154   18.455534  356.674032   
std     28.148861    2.1057

**Print first five rows and last three rows**

In [62]:
# First five rows (head() returns five rows when no count is given).
boston_dataframe.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33


In [63]:
# Last three rows, selected by position.
boston_dataframe.iloc[-3:]

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
503,0.06076,0.0,11.93,0.0,0.573,6.976,91.0,2.1675,1.0,273.0,21.0,396.9,5.64
504,0.10959,0.0,11.93,0.0,0.573,6.794,89.3,2.3889,1.0,273.0,21.0,393.45,6.48
505,0.04741,0.0,11.93,0.0,0.573,6.03,80.8,2.505,1.0,273.0,21.0,396.9,7.88


**Add the target variable into the dataframe we created**

In [64]:
# Attach the target (median home value) as a new column. The target array of
# the dataset object loaded earlier is reused directly, so the dataset is not
# loaded a second time and no intermediate DataFrame is needed.
boston_dataframe['MEDV'] = boston_data.target

# Display the frame, which now includes the MEDV column.
boston_dataframe

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.0900,1.0,296.0,15.3,396.90,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.90,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.90,5.33,36.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.06263,0.0,11.93,0.0,0.573,6.593,69.1,2.4786,1.0,273.0,21.0,391.99,9.67,22.4
502,0.04527,0.0,11.93,0.0,0.573,6.120,76.7,2.2875,1.0,273.0,21.0,396.90,9.08,20.6
503,0.06076,0.0,11.93,0.0,0.573,6.976,91.0,2.1675,1.0,273.0,21.0,396.90,5.64,23.9
504,0.10959,0.0,11.93,0.0,0.573,6.794,89.3,2.3889,1.0,273.0,21.0,393.45,6.48,22.0


### Data preprocessing

**Check if the dataset contains missing values**

In [65]:
# Count missing values per column and show the ten columns with the most.
(
    boston_dataframe
    .isna()
    .sum()
    .sort_values(ascending=False)
    .head(10)
)

MEDV       0
LSTAT      0
B          0
PTRATIO    0
TAX        0
RAD        0
DIS        0
AGE        0
RM         0
NOX        0
dtype: int64

We can see that there are no null values here. In the hotel booking dataset there may be missing values. In that case, delete the affected rows with dropna() (or with drop(), passing the index labels of those rows). You can find the explanation and usage of drop() here:

https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.drop.html

**Check the datatype of the different columns**

In [66]:
# Show the datatype of every column, including the MEDV target added above.
boston_dataframe.dtypes

CRIM       float64
ZN         float64
INDUS      float64
CHAS       float64
NOX        float64
RM         float64
AGE        float64
DIS        float64
RAD        float64
TAX        float64
PTRATIO    float64
B          float64
LSTAT      float64
MEDV       float64
dtype: object

We can see that all the columns have datatype float. If a column has a different datatype, convert it to float by calling astype(float). You can find the explanation and usage of astype() here:

https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.astype.html

### Exploratory Data Analysis

**Remove unwanted features**

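Use drop() to remove the unwanted columns. Pass the column names as a list through the columns parameter (or pass the list together with axis=1); without either, drop() looks for row labels instead. You can find the explanation and usage of drop() here: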

https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.drop.html

**Find the categorical values and remove them as well from the dataset**

In [67]:
# Names of the columns stored with the generic object dtype, which is how
# text/categorical columns typically appear in a DataFrame.
categorical_features = [
    column_name
    for column_name, column_dtype in boston_dataframe.dtypes.items()
    if column_dtype == object
]

In [68]:
# Display the list of object-dtype column names collected in the previous cell.
categorical_features

[]

Here, we do not have any categorical features. In the hotel booking dataset, however, there will be categorical features; drop them from that dataset as well.

###  Create decision tree models for both regression and classification

**Split the data into training and test sets**

In [69]:
# Features: every column except the target.
X = boston_dataframe.drop(columns=['MEDV'])
# Target: the MEDV column on its own.
Y = boston_dataframe.loc[:, 'MEDV']

In [70]:
from sklearn.model_selection import train_test_split

# Hold out a quarter of the rows for testing (the same fraction train_test_split
# uses when no size is given); the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.25,
    random_state=0,
)

**Use decision tree model (regression) to test the data**

In [71]:
from sklearn.tree import DecisionTreeRegressor

# Fit a regression tree on the training split; the fixed seed makes the fitted
# tree reproducible.
clf = DecisionTreeRegressor(random_state=0)
clf.fit(x_train, y_train)

# For a regressor, score() returns the coefficient of determination (R^2) on the
# given data, not a classification accuracy, so it is labelled accordingly.
test_score = clf.score(x_test, y_test)
print("R^2 score:", test_score)

Accuracy: 0.6125205090458208


**Predict the value of the record at position 10 (zero-based) in x_train**

In [72]:
# Zero-based position of the training row to inspect; defined once so the
# feature row and the target value below always refer to the same record.
record_position = 10

# A list indexer keeps the selection as a one-row DataFrame. The model was fitted
# on a DataFrame, so this passes the same column names to predict() instead of a
# bare NumPy array.
prediction = clf.predict(x_train.iloc[[record_position]])

# Actual target value of the same row, taken from y_train.
# The row belongs to the data the tree was fitted on, so a match here does not
# say anything about performance on unseen data.
actual_value = y_train.iloc[record_position]

print(f'Predicted Value \t: {prediction[0]}')
print(f'Actual Value\t\t: {actual_value}')

Predicted Value 	: 13.4
Actual Value		: 13.4


**Use decision tree model (classification) to test the data**

In [73]:
# A classifier needs discrete class labels, but MEDV is a continuous regression
# target, so fit() raises ValueError on the Boston data. The error is caught and
# printed so that the notebook still runs from top to bottom. On the hotel
# booking dataset this will work, and the accuracy is printed instead.
# NOTE: this rebinds clf, replacing the regressor fitted in the earlier cell.

from sklearn.tree import DecisionTreeClassifier

clf = DecisionTreeClassifier(random_state=0)
try:
    clf.fit(x_train, y_train)
except ValueError as err:
    # Show the failure in the same "ExceptionName: message" form as a traceback's
    # last line.
    print(f"{type(err).__name__}: {err}")
else:
    test_score = clf.score(x_test, y_test)
    print("Accuracy:", test_score)

ValueError: Unknown label type: 'continuous'