## Decision Tree

## 0. Data

In [1]:
import pandas as pd

In [2]:
# Download the house-price workbook from the course site and preview the first rows.
house_prices_url = "http://byungwan.com/class/House_Prices.xls"
house = pd.read_excel(house_prices_url)
house.head()

Unnamed: 0,HomeID,Price,SqFt,Bedrooms,Bathrooms,Offers,Brick,Neighborhood
0,1,114300,1790,2,2,2,No,East
1,2,114200,2030,4,2,3,No,East
2,3,114800,1740,3,2,1,No,East
3,4,94700,1980,3,2,3,No,East
4,5,119800,2130,3,3,3,No,East


In [3]:
# One-hot encode the two categorical columns as 0/1 integers. drop_first=True omits
# one level per column, leaving Brick_Yes, Neighborhood_North and Neighborhood_West.
house2 = pd.get_dummies(
    house,
    columns=["Brick", "Neighborhood"],
    drop_first=True,
    dtype=int,
)
house2.head()

Unnamed: 0,HomeID,Price,SqFt,Bedrooms,Bathrooms,Offers,Brick_Yes,Neighborhood_North,Neighborhood_West
0,1,114300,1790,2,2,2,0,0,0
1,2,114200,2030,4,2,3,0,0,0
2,3,114800,1740,3,2,1,0,0,0
3,4,94700,1980,3,2,3,0,0,0
4,5,119800,2130,3,3,3,0,0,0


In [4]:
# Binary target from a two-quantile (median) split of Price: 0 = lower half, 1 = upper half.
house2["House_Class"] = pd.qcut(house["Price"], q=2, labels=[0, 1])

In [5]:
# Preview the frame to confirm the House_Class column is now present.
house2.head()

Unnamed: 0,HomeID,Price,SqFt,Bedrooms,Bathrooms,Offers,Brick_Yes,Neighborhood_North,Neighborhood_West,House_Class
0,1,114300,1790,2,2,2,0,0,0,0
1,2,114200,2030,4,2,3,0,0,0,0
2,3,114800,1740,3,2,1,0,0,0,0
3,4,94700,1980,3,2,3,0,0,0,0
4,5,119800,2130,3,3,3,0,0,0,0


In [6]:
# Positional index of the House_Class column within house2.
house2.columns.get_loc("House_Class")

9

In [7]:
# Select the target and the features by column name instead of by position, so the
# selection cannot silently shift if house2 gains or reorders columns.
# HomeID and Price are left out on purpose: Price is what House_Class is derived from.
feature_cols = [
    "SqFt",
    "Bedrooms",
    "Bathrooms",
    "Offers",
    "Brick_Yes",
    "Neighborhood_North",
    "Neighborhood_West",
]
y = house2["House_Class"]
x = house2[feature_cols]

In [8]:
from sklearn.model_selection import train_test_split

In [9]:
# Hold out 30% of the rows for testing, using seed 0 for the shuffle.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=0,
)

### 1. Decision Tree

In [10]:
from sklearn.tree import DecisionTreeClassifier

In [11]:
# Shallow entropy-based tree reused by the experiments in this notebook.
# random_state is pinned because scikit-learn randomly permutes the candidate features
# at each split, so equally good splits can otherwise be chosen differently between runs.
dtc = DecisionTreeClassifier(criterion="entropy", max_depth=3, random_state=0)

In [12]:
# Train the tree on the training portion of the split.
dtc_model = dtc.fit(X=x_train, y=y_train)

In [13]:
# Predict the class of every held-out row.
y_pred = dtc_model.predict(X=x_test)

In [14]:
from sklearn import metrics

In [15]:
# Share of held-out rows whose predicted class matches the true class.
metrics.accuracy_score(y_true=y_test, y_pred=y_pred)

0.7692307692307693

### cf. Same model, different random split

In [16]:
# Repeat the 70/30 split with a different seed to see how much the score depends on it.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=100,
)

In [17]:
# Refit the same tree configuration on the new training rows.
dtc_model = dtc.fit(X=x_train, y=y_train)

In [18]:
# Predict the held-out rows of the new split.
y_pred = dtc_model.predict(X=x_test)

In [19]:
# Accuracy on the new hold-out set, for comparison with the seed-0 split.
metrics.accuracy_score(y_true=y_test, y_pred=y_pred)

0.8974358974358975

In [20]:
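# A single train/test split gives very different accuracies depending on the seed
# (about 0.77 with random_state=0 vs about 0.90 with random_state=100), so we cannot
# tell which number to trust.
# K-fold cross-validation avoids this by scoring the model on several splits and averaging.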

### 2. Cross Validation

In [21]:
from sklearn.model_selection import cross_val_score

In [22]:
# 5-fold cross-validation of the tree on the full data set; yields one score per fold.
scores = cross_val_score(estimator=dtc, X=x, y=y, cv=5)

In [23]:
# Show the individual per-fold scores.
scores

array([0.92307692, 0.84615385, 0.96153846, 0.88      , 0.76      ])

In [24]:
import numpy as np

In [25]:
# Average score across the folds.
scores.mean()

np.float64(0.8741538461538461)

In [26]:
# Spread of the per-fold scores (population standard deviation).
scores.std()

np.float64(0.06910689772815351)

In [27]:
# Same value as the previous cell; print() shows the plain number instead of the
# np.float64(...) repr produced by a bare expression.
print(np.std(scores))

0.06910689772815351


### 3. Unbalanced data

In [28]:
# Re-display the working frame before building a second, unbalanced target.
house2.head()

Unnamed: 0,HomeID,Price,SqFt,Bedrooms,Bathrooms,Offers,Brick_Yes,Neighborhood_North,Neighborhood_West,House_Class
0,1,114300,1790,2,2,2,0,0,0,0
1,2,114200,2030,4,2,3,0,0,0,0
2,3,114800,1740,3,2,1,0,0,0,0
3,4,94700,1980,3,2,3,0,0,0,0
4,5,119800,2130,3,3,3,0,0,0,0


In [29]:
# Class sizes of the quantile-split target.
house2["House_Class"].value_counts()

House_Class
0    64
1    64
Name: count, dtype: int64

In [30]:
# Deliberately unbalanced target: 0 for prices up to 100,000, 1 for anything above.
# The top bin is left open-ended so a price above 250,000 still gets class 1;
# with a closed upper edge pd.cut would label such rows NaN.
house2["House_Class2"] = pd.cut(
    house2["Price"],
    bins=(0, 100000, float("inf")),
    labels=[0, 1],
)

In [31]:
# Preview the frame to confirm the House_Class2 column is now present.
house2.head()

Unnamed: 0,HomeID,Price,SqFt,Bedrooms,Bathrooms,Offers,Brick_Yes,Neighborhood_North,Neighborhood_West,House_Class,House_Class2
0,1,114300,1790,2,2,2,0,0,0,0,1
1,2,114200,2030,4,2,3,0,0,0,0,1
2,3,114800,1740,3,2,1,0,0,0,0,1
3,4,94700,1980,3,2,3,0,0,0,0,0
4,5,119800,2130,3,3,3,0,0,0,0,1


In [32]:
# Class sizes of the fixed-threshold target.
house2["House_Class2"].value_counts()

House_Class2
1    116
0     12
Name: count, dtype: int64

In [34]:
# Pick both targets and the features by column name instead of by position, so the
# selection cannot silently shift if house2 gains or reorders columns.
feature_cols = [
    "SqFt",
    "Bedrooms",
    "Bathrooms",
    "Offers",
    "Brick_Yes",
    "Neighborhood_North",
    "Neighborhood_West",
]
y1 = house2["House_Class"]   # quantile-split target
y2 = house2["House_Class2"]  # fixed-threshold target
x = house2[feature_cols]

In [35]:
# 70/30 split for the first target (y1), seed 0.
x_train1, x_test1, y_train1, y_test1 = train_test_split(
    x,
    y1,
    test_size=0.3,
    random_state=0,
)

In [36]:
# Fit an independent copy of the configured tree on the first target.
# dtc.fit(...) returns dtc itself, so assigning its result would make dtc_model1 just
# another name for dtc, and it would be overwritten the next time dtc is fitted.
from sklearn.base import clone

dtc_model1 = clone(dtc).fit(x_train1, y_train1)

In [37]:
# Predict the held-out rows for the first target.
y_pred1 = dtc_model1.predict(X=x_test1)

In [38]:
# Hold-out accuracy for the first target.
metrics.accuracy_score(y_true=y_test1, y_pred=y_pred1)

0.7692307692307693

In [39]:
# Same 70/30 split and seed, but for the second target (y2).
x_train2, x_test2, y_train2, y_test2 = train_test_split(
    x,
    y2,
    test_size=0.3,
    random_state=0,
)

In [40]:
# Fit an independent copy of the configured tree on the second target.
# dtc.fit(...) returns dtc itself, so fitting dtc directly here would also refit any
# other variable that points at dtc; cloning keeps this model separate.
from sklearn.base import clone

dtc_model2 = clone(dtc).fit(x_train2, y_train2)

In [41]:
# Predict the held-out rows for the second target.
y_pred2 = dtc_model2.predict(X=x_test2)

In [43]:
# Hold-out accuracy for the second target.
metrics.accuracy_score(y_true=y_test2, y_pred=y_pred2)

0.8717948717948718

In [44]:
# Imbalance: the majority class (116 of 128 rows) occurs far more often, so a model
# that mostly predicts it already scores well, which inflates the accuracy.

## Example 2

In [47]:
# Download the coupon sample workbook from the course site.
coupon_url = "http://byungwan.com/class/Coupon_Sample.xlsx"
coupon = pd.read_excel(coupon_url)

In [48]:
# Check how unevenly the two Discount classes are represented.
coupon["Discount"].value_counts()

Discount
0    377
1    122
Name: count, dtype: int64

### Up-Sampling

In [49]:
from sklearn.utils import resample

In [51]:
# Rows of the majority class (Discount == 0).
is_discount0 = coupon["Discount"] == 0
coupon0 = coupon.loc[is_discount0]

In [53]:
# Rows of the minority class (Discount == 1).
is_discount1 = coupon["Discount"] == 1
coupon1 = coupon.loc[is_discount1]

In [54]:
# Sanity check: the subset should contain only Discount == 1 rows.
coupon1["Discount"].value_counts()

Discount
1    122
Name: count, dtype: int64

In [56]:
# Up-sample the minority class with replacement until it is as large as the majority
# class. The target size is read from coupon0 rather than hard-coded, so the cell
# stays correct if the data changes.
coupon1_upsampled = resample(
    coupon1,
    replace=True,
    n_samples=len(coupon0),
    random_state=0,
)

In [57]:
# Confirm the size of the up-sampled minority class.
coupon1_upsampled["Discount"].value_counts()

Discount
1    377
Name: count, dtype: int64

In [59]:
# Stack the untouched majority rows on top of the up-sampled minority rows.
upsampled_parts = [coupon0, coupon1_upsampled]
coupon_upsampled = pd.concat(upsampled_parts, axis=0)

In [60]:
# Class sizes after up-sampling.
coupon_upsampled["Discount"].value_counts()

Discount
0    377
1    377
Name: count, dtype: int64

### Down-sampling

In [62]:
# Down-sample the majority class without replacement to the size of the minority
# class. The target size is read from coupon1 rather than hard-coded, so the cell
# stays correct if the data changes.
coupon0_downsampled = resample(
    coupon0,
    replace=False,
    n_samples=len(coupon1),
    random_state=0,
)

In [63]:
# Stack the down-sampled majority rows on top of the untouched minority rows.
downsampled_parts = [coupon0_downsampled, coupon1]
coupon_downsampled = pd.concat(downsampled_parts, axis=0)

In [64]:
# Class sizes after down-sampling.
coupon_downsampled["Discount"].value_counts()

Discount
0    122
1    122
Name: count, dtype: int64