In [1]:
## Inference vs Prediction
# Prediction - the focus is on predictive performance (e.g., classification)
# We put in every available predictor and see what happens / Don't worry about interpreting the individual coefficients

# In regression our focus was on inference, but regression can also be used for prediction
# For prediction we don't care what the coefficients a0, a1, ... are
# Linear regression as prediction = finding y_hat

In [2]:
### Confusion Matrix (TP, TN, FP, FN)
## Predict the class
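# The target is the house class: y can only be 0 or 1
# Accuracy = (TP+TN)/Total
# Precision = TP/(TP+FP) - of the predicted positives, how many are correct
# Recall = TP/(TP+FN) - of the actual positives, how many were found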

In [None]:
# Accuracy and recall convey different information
# Depending on the context, choose which metric matters most
# There is a tradeoff between false negatives (FN) and false positives (FP)

## Decision Tree

In [3]:
import pandas as pd

In [6]:
# Load the house-price spreadsheet from the course website into a DataFrame
# (requires network access; this is the raw, unmodified data).
house = pd.read_excel(
    io="http://byungwan.com/class/House_Prices.xls",
)

In [7]:
# Preview the first five rows of the raw data.
house.head(n=5)

Unnamed: 0,HomeID,Price,SqFt,Bedrooms,Bathrooms,Offers,Brick,Neighborhood
0,1,114300,1790,2,2,2,No,East
1,2,114200,2030,4,2,3,No,East
2,3,114800,1740,3,2,1,No,East
3,4,94700,1980,3,2,3,No,East
4,5,119800,2130,3,3,3,No,East


In [8]:
# Predict house prices, but not with a regression model as we did before
# Instead, build a classification model

In [9]:
# The target needs to be categorical -> convert the continuous Price into categories

### 1. Convert numerical to categorical

In [13]:
# Bin Price into two quantile-based groups (split at the median):
#   0 = lower-priced group, 1 = higher-priced group.
house["House_Class"] = pd.qcut(
    x=house["Price"],
    q=2,
    labels=[0, 1],
)

In [16]:
# Check that the new House_Class column was added.
house.head(n=5)

Unnamed: 0,HomeID,Price,SqFt,Bedrooms,Bathrooms,Offers,Brick,Neighborhood,House_Class
0,1,114300,1790,2,2,2,No,East,0
1,2,114200,2030,4,2,3,No,East,0
2,3,114800,1740,3,2,1,No,East,0
3,4,94700,1980,3,2,3,No,East,0
4,5,119800,2130,3,3,3,No,East,0


### 2. Convert categorical to numerical

In [18]:
# One-hot encode the two text columns as 0/1 integer indicators.
# drop_first=True drops the first level of each column (Brick "No",
# Neighborhood "East"), which then acts as the baseline category.
# NOTE: this overwrites `house`; re-running this cell on the already-encoded
# frame fails because the Brick/Neighborhood columns no longer exist.
# Reload the data first if you need to run it again.
house = pd.get_dummies(
    data=house,
    columns=["Brick", "Neighborhood"],
    drop_first=True,
    dtype=int,
)

In [19]:
# Check the encoded indicator columns.
house.head(n=5)

Unnamed: 0,HomeID,Price,SqFt,Bedrooms,Bathrooms,Offers,House_Class,Brick_Yes,Neighborhood_North,Neighborhood_West
0,1,114300,1790,2,2,2,0,0,0,0
1,2,114200,2030,4,2,3,0,0,0,0
2,3,114800,1740,3,2,1,0,0,0,0
3,4,94700,1980,3,2,3,0,0,0,0
4,5,119800,2130,3,3,3,0,0,0,0


### 3. Define Y & X

In [20]:
# Look up the positional index of the target column.
house.columns.get_loc(key="House_Class")

6

In [21]:
# Target: the binary price class (0 = lower-priced, 1 = higher-priced).
# Selected by column name rather than by position, so it stays correct
# even if the column order of `house` changes.
y = house["House_Class"]

In [22]:
# Features: every predictor except the identifier (HomeID), the raw Price
# (the target was derived from it) and the target itself.
# Selected by column name rather than by position, so a change in column
# order cannot silently swap in the wrong columns (e.g. Price).
feature_columns = [
    "SqFt",
    "Bedrooms",
    "Bathrooms",
    "Offers",
    "Brick_Yes",
    "Neighborhood_North",
    "Neighborhood_West",
]
x = house[feature_columns]

In [24]:
# Confirm the feature matrix holds only the intended predictors.
x.head(n=5)

Unnamed: 0,SqFt,Bedrooms,Bathrooms,Offers,Brick_Yes,Neighborhood_North,Neighborhood_West
0,1790,2,2,2,0,0,0
1,2030,4,2,3,0,0,0
2,1740,3,2,1,0,0,0
3,1980,3,2,3,0,0,0
4,2130,3,3,3,0,0,0


In [25]:
from sklearn.model_selection import train_test_split

In [26]:
# Hold out 30% of the rows for testing and train on the remaining 70%.
# random_state seeds the random shuffle, so the same integer always
# reproduces exactly the same train/test split.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=0,
)

### 4. Decision Tree

In [27]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
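# Split criterion: entropy (information gain, the measure used by ID3) or the Gini index (sklearn's default)
# Note: sklearn's tree is an optimized version of CART, not ID3 itself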

In [28]:
# Decision tree that chooses splits by information gain (entropy).
# random_state is fixed because sklearn randomly permutes the features at
# each split, so ties between equally good splits can be broken differently
# from run to run; seeding makes the fitted tree and the metrics reproducible.
dtc = DecisionTreeClassifier(criterion="entropy", random_state=0)

In [29]:
# Train the tree on the training split. fit() returns the estimator itself,
# so dt_model and dtc refer to the same fitted object.
dt_model = dtc.fit(X=x_train, y=y_train)

In [31]:
# Predict the class (0/1) for each held-out test row.
y_pred = dt_model.predict(X=x_test)

In [32]:
from sklearn import metrics

In [33]:
# Share of test rows classified correctly: (TP + TN) / Total.
metrics.accuracy_score(y_true=y_test, y_pred=y_pred)

0.8205128205128205

In [34]:
# Rows are the actual classes and columns the predicted classes,
# so the layout is [[TN, FP], [FN, TP]].
metrics.confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[14,  3],
       [ 4, 18]])

In [35]:
# Per-class precision, recall and F1. The report is a preformatted string,
# so it is printed rather than shown as a bare repr.
print(
    metrics.classification_report(y_true=y_test, y_pred=y_pred)
)

              precision    recall  f1-score   support

           0       0.78      0.82      0.80        17
           1       0.86      0.82      0.84        22

    accuracy                           0.82        39
   macro avg       0.82      0.82      0.82        39
weighted avg       0.82      0.82      0.82        39

